diff --git a/.catalog-onboard-pipeline.yaml b/.catalog-onboard-pipeline.yaml index 4c9003c4..d89fea83 100644 --- a/.catalog-onboard-pipeline.yaml +++ b/.catalog-onboard-pipeline.yaml @@ -2,10 +2,10 @@ apiVersion: v1 offerings: # below is an example of a Deployable Architecture (DA) solution -- name: deploy-arch-ibm-hpc-lsf # must match the offering name in the ibm_catalog.json +- name: deploy-arch-ibm-hpc # must match the offering name in the ibm_catalog.json kind: solution - catalog_id: 0d89ec0d-d39a-494d-ac5b-9d940d8cc65f - offering_id: 1444e20a-af22-40d1-af98-c880918849cb + catalog_id: 8611e025-10b2-488e-8261-a7f584a5114b + offering_id: bf3c07f8-5a62-4289-8ea0-94dbb2b410e6 # list all of the variations (flavors) you have included in the ibm_catalog.json variations: - name: Cluster-with-LSF diff --git a/.cra/.fileignore b/.cra/.fileignore new file mode 100644 index 00000000..5cce52bc --- /dev/null +++ b/.cra/.fileignore @@ -0,0 +1,3 @@ +# ignore temporary copies of referenced repos +.terraform +common-dev-assets diff --git a/.gitignore b/.gitignore index aad210b0..3e5c8e48 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,6 @@ localtweak__*.tf # tests folder log file *.log + +# Ignore RSA files +*id_rsa diff --git a/.secrets.baseline b/.secrets.baseline index 4ce66286..e72e0cdb 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -1,9 +1,9 @@ { "exclude": { - "files": "acceptance-test/package-lock.json|test/go.sum|go.sum|^.secrets.baseline$", + "files": "^.secrets.baseline$", "lines": null }, - "generated_at": "2025-02-12T12:53:44Z", + "generated_at": "2025-06-19T07:38:57Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -76,7 +76,34 @@ "name": "TwilioKeyDetector" } ], - "results": {}, + "results": { + "samples/configs/hpc_catalog_values.json": [ + { + "hashed_secret": "5073c7ac17500ef0678aebc7138a996b4f75d623", + "is_secret": true, + "is_verified": false, + "line_number": 8, + "type": "Secret Keyword", + "verified_result": null + }, + { + "hashed_secret": "1f5e25be9b575e9f5d39c82dfd1d9f4d73f1975c", + "is_secret": true, + "is_verified": false, + "line_number": 37, + "type": "Secret Keyword", + "verified_result": null + }, + { + "hashed_secret": "b295b04949a98dc50ba65adcddd588077b93ab3c", + "is_secret": true, + "is_verified": false, + "line_number": 60, + "type": "Secret Keyword", + "verified_result": null + } + ] + }, "version": "0.13.1+ibm.62.dss", "word_list": { "file": null, diff --git a/.tekton/lsf-da-longterm/lsf-da-pr-pipeline/listener-git-pr-status.yaml b/.tekton/lsf-da-longterm/lsf-da-pr-pipeline/listener-git-pr-status.yaml new file mode 100644 index 00000000..8e2a7d12 --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf-da-pr-pipeline/listener-git-pr-status.yaml @@ -0,0 +1,180 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: TriggerTemplate +args: [--allow-multiple-documents] +metadata: + name: triggertemplate-git-pr-status +spec: + params: + - name: git_access_token + description: the token to access the git repository for the clone operations + - name: repository + description: The git repo + default: " " + - name: branch + description: the branch for the git repo + - name: directory-name + default: "." 
+ - name: pr-repository + description: The source git repo for the PullRequest + default: " " + - name: pr-branch + description: The source branch for the PullRequest + default: " " + - name: pr-revision + description: the commit id/sha for the PullRequest + default: " " + - name: triggerName + default: "git-pr-process" + - name: pipeline-debug + default: "0" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed; a single zone input value is required. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone. [Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note: if the resource group value is set to null, the automation creates two different resource groups named workload-rg and service-rg. For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: compute_image_name_ubuntu + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2).
If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL 8.8 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: cos_region + description: The COS region name. + default: "" + - name: cos_bucket + description: The COS bucket name. + default: "" + - name: cos_instance_crn + description: The COS instance CRN. + default: "" + - name: cos_api_key + description: The COS account API key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: solution + description: Provide the solution value that is needed to support lsf and HPC.
+ default: "lsf" + resourcetemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: $(params.triggerName)-$(uid)-pvc + spec: + resources: + requests: + storage: 5Gi + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + - apiVersion: tekton.dev/v1beta1 + kind: PipelineRun + metadata: + name: $(params.triggerName)-$(uid) + spec: + pipelineRef: + name: pipeline-git-pr-status + params: + - name: git_access_token + value: $(params.git_access_token) + - name: repository + value: $(params.repository) + - name: branch + value: $(params.branch) + - name: pr-repository + value: $(params.pr-repository) + - name: pr-branch + value: $(params.pr-branch) + - name: pr-revision + value: $(params.pr-revision) + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: directory-name + value: $(params.directory-name) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: compute_image_name_ubuntu + value: $(params.compute_image_name_ubuntu) + - name: login_image_name + value: $(params.login_image_name) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: solution + value: $(params.solution) + workspaces: + - name: pipeline-ws + persistentVolumeClaim: + claimName: $(params.triggerName)-$(uid)-pvc +--- +apiVersion: tekton.dev/v1beta1 +kind: TriggerBinding +metadata: + name: triggerbinding-git-pr-status-github-pr +spec: + params: + - name: repository + value: "$(event.pull_request.base.repo.clone_url)" + - name: branch + value: "$(event.pull_request.base.ref)" + - name: pr-repository + value: "$(event.pull_request.head.repo.clone_url)" + - name: pr-branch + value: "$(event.pull_request.head.ref)" + - name: pr-revision + value: "$(event.pull_request.head.sha)" + - name: triggerName + value: "github-pullrequest" +--- +apiVersion: tekton.dev/v1beta1 +kind: EventListener +metadata: + name: eventlistener-git-pr-status-github-pr +spec: + triggers: + - binding: + name: triggerbinding-git-pr-status-github-pr + template: + name: triggertemplate-git-pr-status diff --git a/.tekton/lsf-da-longterm/lsf-da-pr-pipeline/lsf-pipeline-git-pr-status.yaml b/.tekton/lsf-da-longterm/lsf-da-pr-pipeline/lsf-pipeline-git-pr-status.yaml new file mode 100644 index 00000000..b443daa1 --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf-da-pr-pipeline/lsf-pipeline-git-pr-status.yaml @@ -0,0 +1,334 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Pipeline +metadata: + name: pipeline-git-pr-status +spec: + params: + - name: repository + description: the git repo + - name: branch + description: the branch for the git repo + - name: pr-repository + description: The source git repo for the PullRequest + default: "" + - name: pr-branch + description: The source branch for the PullRequest + default: "" + - name: pr-revision + description: the commit id/sha for the PullRequest + default: "" + - name: 
git_access_token + description: the token to access the git repository for the clone operations + default: "" + - name: properties-file + default: "output/thebuild.properties" + - name: git-credentials-json-file + default: "output/secrets/thecredentials.json" + - name: context + default: "commit message check" + - name: description + default: "verify the commit message" + - name: pipeline-debug + default: "0" + - name: directory-name + default: "." + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed; a single zone input value is required. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone. [Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note: if the resource group value is set to null, the automation creates two different resource groups named workload-rg and service-rg. For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: compute_image_name_ubuntu + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2).
If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL 8.8 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: cos_region + description: The COS region name. + default: "" + - name: cos_bucket + description: The COS bucket name. + default: "" + - name: cos_instance_crn + description: The COS instance CRN. + default: "" + - name: cos_api_key + description: The COS account API key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: solution + description: Provide the solution value that is needed to support lsf and HPC.
+ default: "lsf" + workspaces: + - name: pipeline-ws + tasks: + - name: set-git-pr-pending + taskRef: + name: git-set-commit-status + workspaces: + - name: artifacts + workspace: pipeline-ws + params: + - name: repository + value: $(params.repository) + - name: revision + value: $(params.pr-revision) + - name: context + value: $(params.context) + - name: description + value: $(params.description) + - name: state + value: "pending" + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: git-clone + taskRef: + name: git-clone-repo + runAfter: [set-git-pr-pending] + params: + - name: repository + value: $(params.repository) + - name: branch + value: $(params.branch) + - name: pr-repository + value: $(params.pr-repository) + - name: pr-branch + value: $(params.pr-branch) + - name: pr-revision + value: $(params.pr-revision) + - name: git_access_token + value: $(params.git_access_token) + - name: directory-name + value: $(params.directory-name) + - name: properties-file + value: $(params.properties-file) + - name: git-credentials-json-file + value: $(params.git-credentials-json-file) + - name: pipeline-debug + value: $(params.pipeline-debug) + workspaces: + - name: output + workspace: pipeline-ws + - name: set-git-pr-running + runAfter: [git-clone] + taskRef: + name: git-set-commit-status + workspaces: + - name: artifacts + workspace: pipeline-ws + params: + - name: repository + value: $(params.repository) + - name: revision + value: $(params.pr-revision) + - name: context + value: $(params.context) + - name: description + value: $(params.description) + - name: state + value: "running" + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: pre-requisites-install + runAfter: [git-clone] + taskRef: + name: pre-requisites-install + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: repository + value: $(params.repository) + - name: ssh-key-creation + runAfter: [git-clone, pre-requisites-install] + taskRef: + name: ssh-key-creation + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: resource_group + value: $(params.resource_group) + - name: pr-revision + value: $(params.pr-revision) + - name: solution + value: $(params.solution) + - name: wes-lsf-da-rhel-pr + runAfter: [git-clone, pre-requisites-install, ssh-key-creation] + taskRef: + name: wes-lsf-da-rhel-pr + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: repository + value: $(params.repository) + - name: git_access_token + value: $(params.git_access_token) + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: login_image_name + value: $(params.login_image_name) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + - name: pr-revision + value: $(params.pr-revision) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: 
git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: solution + value: $(params.solution) + # - name: wes-lsf-da-ubuntu-pr + # runAfter: [git-clone, pre-requisites-install, ssh-key-creation] + # taskRef: + # name: wes-lsf-da-ubuntu-pr + # workspaces: + # - name: workspace + # workspace: pipeline-ws + # params: + # - name: repository + # value: $(params.repository) + # - name: git_access_token + # value: $(params.git_access_token) + # - name: pipeline-debug + # value: $(params.pipeline-debug) + # - name: zone + # value: $(params.zone) + # - name: resource_group + # value: $(params.resource_group) + # - name: compute_image_name_ubuntu + # value: $(params.compute_image_name_ubuntu) + # - name: login_image_name + # value: $(params.login_image_name) + # - name: management_image_name + # value: $(params.management_image_name) + # - name: pr-revision + # value: $(params.pr-revision) + # - name: cos_region + # value: $(params.cos_region) + # - name: cos_bucket + # value: $(params.cos_bucket) + # - name: cos_instance_crn + # value: $(params.cos_instance_crn) + # - name: cos_api_key + # value: $(params.cos_api_key) + # - name: hpc_custom_reports_repo + # value: $(params.hpc_custom_reports_repo) + # - name: hpc_custom_reports_branch + # value: $(params.hpc_custom_reports_branch) + # - name: git_user_name + # value: $(params.git_user_name) + # - name: git_user_email + # value: $(params.git_user_email) + - name: ssh-key-deletion + runAfter: [wes-lsf-da-rhel-pr] + taskRef: + name: ssh-key-deletion + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: pr-revision + value: $(params.pr-revision) + - name: solution + value: $(params.solution) + - name: error-check-on-lsf-pr-infra-log + runAfter: [git-clone, wes-lsf-da-rhel-pr] + workspaces: + - name: workspace + workspace: pipeline-ws + taskSpec: + workspaces: + - name: workspace + description: The git repo will be cloned onto the volume backing this workspace + mountPath: /artifacts + steps: + - name: error-pr-rhel-suite + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + LOG_FILE_NAME="lsf-da-pr-rhel-suite.json" + source .tekton/scripts/issue_track.sh + issue_track "${LOG_FILE_NAME}" "PR" + # - name: error-pr-ubuntu-suite + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # LOG_FILE_NAME="pipeline-pr-ubuntu-suite.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" "PR" + finally: + - name: set-git-commit-status + taskRef: + name: git-set-commit-status + workspaces: + - name: artifacts + workspace: pipeline-ws + params: + - name: repository + value: $(params.repository) + - name: revision + value: $(params.pr-revision) + - name: context + value: $(params.context) + - name: description + value: $(params.description) + - name: state + value: "$(tasks.error-check-on-lsf-pr-infra-log.status)" + - name: pipeline-debug + value: $(params.pipeline-debug) diff --git a/.tekton/lsf-da-longterm/lsf-da-regression-pipeline/listener-git-trigger.yaml b/.tekton/lsf-da-longterm/lsf-da-regression-pipeline/listener-git-trigger.yaml new file mode 100644 index 00000000..dcbd560d --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf-da-regression-pipeline/listener-git-trigger.yaml @@ 
-0,0 +1,290 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: TriggerTemplate +args: [--allow-multiple-documents] +metadata: + name: triggertemplate-git-trigger +spec: + params: + - name: git_access_token + description: the token to access the git repository for the clone operations + - name: repository + description: The git repo + default: " " + - name: branch + description: the branch for the git repo + - name: revision + description: the commit id/sha for the clone action + default: " " + - name: pr-repository + description: The source git repo for the PullRequest + default: " " + - name: pr-branch + description: The source branch for the PullRequest + default: " " + - name: pr-revision + description: the commit id/sha for the PullRequest + default: " " + - name: directory-name + default: "." + - name: triggerName + default: "git-pr-process" + - name: pipeline-debug + default: "0" + - name: state + default: "success" + - name: description + default: "The status of tekton commit" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed; a single zone input value is required. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone. [Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note: if the resource group value is set to null, the automation creates two different resource groups named workload-rg and service-rg. For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: compute_image_name_ubuntu + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering.
+ default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL 8.8 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: cos_region + description: The COS region name. + default: "" + - name: cos_bucket + description: The COS bucket name. + default: "" + - name: cos_instance_crn + description: The COS instance CRN. + default: "" + - name: cos_api_key + description: The COS account API key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: solution + description: Provide the solution value that is needed to support lsf and HPC.
+ default: "lsf" + - name: pac_ha_exist_certificate + description: PAC HA Existing Certificate + default: "" + resourcetemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: $(params.triggerName)-$(uid)-pvc + spec: + resources: + requests: + storage: 5Gi + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + - apiVersion: tekton.dev/v1beta1 + kind: PipelineRun + metadata: + name: $(params.triggerName)-$(uid) + spec: + pipelineRef: + name: pipeline-git-event-processing + params: + - name: git_access_token + value: $(params.git_access_token) + - name: directory-name + value: $(params.directory-name) + - name: repository + value: $(params.repository) + - name: branch + value: $(params.branch) + - name: revision + value: $(params.revision) + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: state + value: $(params.state) + - name: description + value: $(params.description) + - name: resource_group + value: $(params.resource_group) + - name: zone + value: $(params.zone) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: management_image_name + value: $(params.management_image_name) + - name: login_image_name + value: $(params.login_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: solution + value: $(params.solution) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: pac_ha_exist_certificate + value: $(params.pac_ha_exist_certificate) + workspaces: + - name: pipeline-ws + persistentVolumeClaim: + claimName: $(params.triggerName)-$(uid)-pvc +--- +apiVersion: tekton.dev/v1beta1 +kind: TriggerBinding +metadata: + name: triggerbinding-git-trigger-manual +spec: + params: + - name: repository + value: $(params.repository) + - name: branch + value: $(params.branch) + - name: triggerName + value: manual-trigger + - name: resource_group + value: $(params.resource_group) + - name: zone + value: $(params.zone) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: management_image_name + value: $(params.management_image_name) + - name: login_image_name + value: $(params.login_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: solution + value: $(params.solution) + - name: pac_ha_exist_certificate + value: $(params.pac_ha_exist_certificate) +--- +apiVersion: tekton.dev/v1beta1 +kind: EventListener +metadata: + name: eventlistener-git-trigger-manual +spec: + triggers: + - binding: + name: triggerbinding-git-trigger-manual + template: + name: triggertemplate-git-trigger +--- +apiVersion: tekton.dev/v1beta1 +kind: TriggerBinding +metadata: + name: 
triggerbinding-git-trigger-github-pr +spec: + params: + - name: repository + value: "$(event.pull_request.base.repo.clone_url)" + - name: branch + value: "$(event.pull_request.base.ref)" + - name: pr-repository + value: "$(event.pull_request.head.repo.clone_url)" + - name: pr-branch + value: "$(event.pull_request.head.ref)" + - name: pr-revision + value: "$(event.pull_request.head.sha)" + - name: triggerName + value: "github-pullrequest" +--- +apiVersion: tekton.dev/v1beta1 +kind: TriggerBinding +metadata: + name: triggerbinding-git-trigger-github-commit +spec: + params: + - name: triggerName + value: "github-commit" + - name: repository + value: "$(event.repository.url)" + - name: revision + value: "$(event.head_commit.id)" + - name: branch + value: "$(event.ref)" + - name: resource_group + value: $(event.ref) + - name: compute_image_name_rhel + value: $(event.ref) + - name: management_image_name + value: $(event.ref) + - name: login_image_name + value: $(event.ref) + - name: hpc_custom_reports_repo + value: $(event.ref) + - name: hpc_custom_reports_branch + value: $(event.ref) + - name: git_user_name + value: $(event.ref) + - name: git_user_email + value: $(event.ref) + - name: solution + value: $(event.ref) + - name: pac_ha_exist_certificate + value: $(event.ref) +--- +apiVersion: tekton.dev/v1beta1 +kind: EventListener +metadata: + name: eventlistener-git-trigger-github-pr +spec: + triggers: + - binding: + name: triggerbinding-git-trigger-github-pr + template: + name: triggertemplate-git-trigger +--- +apiVersion: tekton.dev/v1beta1 +kind: EventListener +metadata: + name: eventlistener-git-trigger-github-commit +spec: + triggers: + - binding: + name: triggerbinding-git-trigger-github-commit + template: + name: triggertemplate-git-trigger diff --git a/.tekton/lsf-da-longterm/lsf-da-regression-pipeline/lsf-pipeline-git-trigger.yaml b/.tekton/lsf-da-longterm/lsf-da-regression-pipeline/lsf-pipeline-git-trigger.yaml new file mode 100644 index 00000000..77b8b482 --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf-da-regression-pipeline/lsf-pipeline-git-trigger.yaml @@ -0,0 +1,991 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Pipeline +metadata: + name: pipeline-git-event-processing +spec: + params: + - name: repository + description: the git repo + - name: branch + description: the branch for the git repo + - name: revision + description: the commit id/sha for the clone action + default: "" + - name: state + - name: description + default: "The status of tekton commit" + - name: git_access_token + description: the token to access the git repository for the clone operations + default: "" + - name: properties-file + default: "output/thebuild.properties" + - name: git-credentials-json-file + default: "output/secrets/thecredentials.json" + - name: directory-name + default: "." + - name: pipeline-debug + default: "0" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed; a single zone input value is required. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone. [Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed.
Note: if the resource group value is set to null, the automation creates two different resource groups named workload-rg and service-rg. For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: compute_image_name_ubuntu + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8.6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers an Ubuntu 22.04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL 8.8 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [Planning for custom images](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: cos_region + description: The COS region name. + default: "" + - name: cos_bucket + description: The COS bucket name.
+ default: "" + - name: cos_instance_crn + description: The COS instance CRN. + default: "" + - name: cos_api_key + description: The COS account API key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: solution + description: Provide the solution value that is needed to support lsf and HPC. + default: "lsf" + - name: pac_ha_exist_certificate + description: PAC HA Existing Certificate + default: "" + workspaces: + - name: pipeline-ws + tasks: + - name: git-clone + taskRef: + name: git-clone-repo + params: + - name: repository + value: $(params.repository) + - name: branch + value: $(params.branch) + - name: revision + value: $(params.revision) + - name: git_access_token + value: $(params.git_access_token) + - name: directory-name + value: "$(params.directory-name)" + - name: properties-file + value: $(params.properties-file) + - name: git-credentials-json-file + value: $(params.git-credentials-json-file) + - name: pipeline-debug + value: $(params.pipeline-debug) + workspaces: + - name: output + workspace: pipeline-ws + - name: pre-requisites-install + runAfter: [git-clone] + taskRef: + name: pre-requisites-install + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: repository + value: $(params.repository) + - name: ssh-key-creation + runAfter: [git-clone, pre-requisites-install] + taskRef: + name: ssh-key-creation + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: resource_group + value: $(params.resource_group) + - name: revision + value: $(params.revision) + - name: solution + value: $(params.solution) + - name: wes-lsf-da-rhel-1 + runAfter: [git-clone, pre-requisites-install, ssh-key-creation] + taskRef: + name: wes-lsf-da-rhel-1 + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: repository + value: $(params.repository) + - name: git_access_token + value: $(params.git_access_token) + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: compute_image_name_ubuntu + value: $(params.compute_image_name_ubuntu) + - name: login_image_name + value: $(params.login_image_name) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + - name: revision + value: $(params.revision) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: solution + value: $(params.solution) + - name: wes-lsf-da-rhel-2 + runAfter: [git-clone, 
pre-requisites-install, ssh-key-creation] + taskRef: + name: wes-lsf-da-rhel-2 + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: repository + value: $(params.repository) + - name: git_access_token + value: $(params.git_access_token) + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: compute_image_name_ubuntu + value: $(params.compute_image_name_ubuntu) + - name: login_image_name + value: $(params.login_image_name) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + - name: revision + value: $(params.revision) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: solution + value: $(params.solution) + - name: wes-lsf-da-rhel-3 + runAfter: [git-clone, pre-requisites-install, ssh-key-creation] + taskRef: + name: wes-lsf-da-rhel-3 + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: repository + value: $(params.repository) + - name: git_access_token + value: $(params.git_access_token) + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: compute_image_name_ubuntu + value: $(params.compute_image_name_ubuntu) + - name: login_image_name + value: $(params.login_image_name) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + - name: revision + value: $(params.revision) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: solution + value: $(params.solution) + - name: wes-lsf-da-rhel-4 + runAfter: [git-clone, pre-requisites-install, ssh-key-creation] + taskRef: + name: wes-lsf-da-rhel-4 + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: repository + value: $(params.repository) + - name: git_access_token + value: $(params.git_access_token) + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: compute_image_name_ubuntu + value: $(params.compute_image_name_ubuntu) + - name: login_image_name + value: $(params.login_image_name) + - name: management_image_name + value: $(params.management_image_name) + 
- name: deployer_image_name + value: $(params.deployer_image_name) + - name: revision + value: $(params.revision) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: solution + value: $(params.solution) + - name: pac_ha_exist_certificate + value: $(params.pac_ha_exist_certificate) + # # - name: wes-lsf-da-ubuntu + # # runAfter: [git-clone, pre-requisites-install, ssh-key-creation] + # # taskRef: + # # name: wes-lsf-da-ubuntu + # # workspaces: + # # - name: workspace + # # workspace: pipeline-ws + # # params: + # # - name: repository + # # value: $(params.repository) + # # - name: git_access_token + # # value: $(params.git_access_token) + # # - name: pipeline-debug + # # value: $(params.pipeline-debug) + # # - name: zone + # # value: $(params.zone) + # # - name: resource_group + # # value: $(params.resource_group) + # # - name: compute_image_name_rhel + # # value: $(params.compute_image_name_rhel) + # # - name: compute_image_name_ubuntu + # # value: $(params.compute_image_name_ubuntu) + # # - name: login_image_name + # # value: $(params.login_image_name) + # # - name: management_image_name + # # value: $(params.management_image_name) + # # - name: revision + # # value: $(params.revision) + # # - name: cos_region + # # value: $(params.cos_region) + # # - name: cos_bucket + # # value: $(params.cos_bucket) + # # - name: cos_instance_crn + # # value: $(params.cos_instance_crn) + # # - name: cos_api_key + # # value: $(params.cos_api_key) + # # - name: hpc_custom_reports_repo + # # value: $(params.hpc_custom_reports_repo) + # # - name: hpc_custom_reports_branch + # # value: $(params.hpc_custom_reports_branch) + # # - name: git_user_name + # # value: $(params.git_user_name) + # # - name: git_user_email + # # value: $(params.git_user_email) + # - name: wes-lsf-da-region + # runAfter: [git-clone, pre-requisites-install, ssh-key-creation] + # taskRef: + # name: wes-lsf-da-region + # workspaces: + # - name: workspace + # workspace: pipeline-ws + # params: + # - name: repository + # value: $(params.repository) + # - name: git_access_token + # value: $(params.git_access_token) + # - name: pipeline-debug + # value: $(params.pipeline-debug) + # - name: zone + # value: $(params.zone) + # - name: resource_group + # value: $(params.resource_group) + # - name: compute_image_name_rhel + # value: $(params.compute_image_name_rhel) + # - name: compute_image_name_ubuntu + # value: $(params.compute_image_name_ubuntu) + # - name: login_image_name + # value: $(params.login_image_name) + # - name: management_image_name + # value: $(params.management_image_name) + # - name: deployer_image_name + # value: $(params.deployer_image_name) + # - name: revision + # value: $(params.revision) + # - name: cos_region + # value: $(params.cos_region) + # - name: cos_bucket + # value: $(params.cos_bucket) + # - name: cos_instance_crn + # value: $(params.cos_instance_crn) + # - name: cos_api_key + # value: $(params.cos_api_key) + # - name: hpc_custom_reports_repo + # value: $(params.hpc_custom_reports_repo) + # - name: hpc_custom_reports_branch + # value: $(params.hpc_custom_reports_branch) + 
# - name: git_user_name + # value: $(params.git_user_name) + # - name: git_user_email + # value: $(params.git_user_email) + # - name: solution + # value: $(params.solution) + # - name: wes-lsf-da-negative + # runAfter: [git-clone, pre-requisites-install, ssh-key-creation] + # taskRef: + # name: wes-lsf-da-negative + # workspaces: + # - name: workspace + # workspace: pipeline-ws + # params: + # - name: repository + # value: $(params.repository) + # - name: git_access_token + # value: $(params.git_access_token) + # - name: pipeline-debug + # value: $(params.pipeline-debug) + # - name: zone + # value: $(params.zone) + # - name: resource_group + # value: $(params.resource_group) + # - name: compute_image_name_rhel + # value: $(params.compute_image_name_rhel) + # - name: compute_image_name_ubuntu + # value: $(params.compute_image_name_ubuntu) + # - name: login_image_name + # value: $(params.login_image_name) + # - name: management_image_name + # value: $(params.management_image_name) + # - name: deployer_image_name + # value: $(params.deployer_image_name) + # - name: revision + # value: $(params.revision) + # - name: cos_region + # value: $(params.cos_region) + # - name: cos_bucket + # value: $(params.cos_bucket) + # - name: cos_instance_crn + # value: $(params.cos_instance_crn) + # - name: cos_api_key + # value: $(params.cos_api_key) + # - name: hpc_custom_reports_repo + # value: $(params.hpc_custom_reports_repo) + # - name: hpc_custom_reports_branch + # value: $(params.hpc_custom_reports_branch) + # - name: git_user_name + # value: $(params.git_user_name) + # - name: git_user_email + # value: $(params.git_user_email) + - name: ssh-key-deletion + runAfter: [ + wes-lsf-da-rhel-1, + wes-lsf-da-rhel-2, + wes-lsf-da-rhel-3, + wes-lsf-da-rhel-4, + # wes-lsf-da-region, + # wes-lsf-da-negative, + ] + taskRef: + name: ssh-key-deletion + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: pipeline-debug + value: $(params.pipeline-debug) + - name: revision + value: $(params.revision) + - name: solution + value: $(params.solution) + - name: git-content-inspect + runAfter: [ + wes-lsf-da-rhel-1, + wes-lsf-da-rhel-2, + wes-lsf-da-rhel-3, + wes-lsf-da-rhel-4, + # wes-lsf-da-region, + # wes-lsf-da-negative, + ] + taskRef: + name: inspect-git-content + workspaces: + - name: workspace + workspace: pipeline-ws + params: + - name: repository + value: $(tasks.git-clone.results.git-repository) + - name: directory-name + value: $(tasks.git-clone.results.clone-directory) + - name: properties-file + value: $(params.properties-file) + - name: git-credentials-json-file + value: $(params.git-credentials-json-file) + - name: git-branch + value: $(tasks.git-clone.results.git-branch) + - name: git-commit + value: $(tasks.git-clone.results.git-commit) + - name: git-user + value: $(tasks.git-clone.results.git-user) + - name: display-validation-logs + runAfter: [ + wes-lsf-da-rhel-1, + wes-lsf-da-rhel-2, + wes-lsf-da-rhel-3, + wes-lsf-da-rhel-4, + # wes-lsf-da-region, + # wes-lsf-da-negative, + ] + workspaces: + - name: workspace + workspace: pipeline-ws + taskSpec: + workspaces: + - name: workspace + description: The git repo will be cloned onto the volume backing this workspace + mountPath: /artifacts + steps: + - name: validation-lsf-da-rhel-suite-1 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-1.log" + source 
.tekton/scripts/issue_track.sh + display_validation_log "${VALIDATION_LOG_FILE_NAME}" + - name: validation-lsf-da-rhel-suite-2 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-2.log" + source .tekton/scripts/issue_track.sh + display_validation_log "${VALIDATION_LOG_FILE_NAME}" + - name: validation-lsf-da-rhel-suite-3 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-3.log" + source .tekton/scripts/issue_track.sh + display_validation_log "${VALIDATION_LOG_FILE_NAME}" + - name: validation-lsf-da-rhel-suite-4 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-4.log" + source .tekton/scripts/issue_track.sh + display_validation_log "${VALIDATION_LOG_FILE_NAME}" + - name: validation-lsf-da-rhel-suite-5 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-5.log" + source .tekton/scripts/issue_track.sh + display_validation_log "${VALIDATION_LOG_FILE_NAME}" + - name: validation-lsf-da-rhel-suite-6 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-6.log" + source .tekton/scripts/issue_track.sh + display_validation_log "${VALIDATION_LOG_FILE_NAME}" + - name: validation-lsf-da-rhel-suite-7 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-7.log" + source .tekton/scripts/issue_track.sh + display_validation_log "${VALIDATION_LOG_FILE_NAME}" + - name: validation-lsf-da-rhel-suite-8 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-8.log" + source .tekton/scripts/issue_track.sh + display_validation_log "${VALIDATION_LOG_FILE_NAME}" + + # - name: validation-lsf-da-rhel-suite-9 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-9.log" + # source .tekton/scripts/issue_track.sh + # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + # - name: validation-lsf-da-rhel-suite-10 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-10.log" + # source .tekton/scripts/issue_track.sh + # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + # - name: validation-lsf-da-rhel-suite-11 + # onError: continue + # image: 
icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # VALIDATION_LOG_FILE_NAME="lsf-da-rhel-suite-11.log" + # source .tekton/scripts/issue_track.sh + # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + # # - name: validation-lsf-ubuntu-suite + # # onError: continue + # # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # # workingDir: "/artifacts" + # # command: ["/bin/bash", "-c"] + # # args: + # # - | + # # #!/bin/bash + # # VALIDATION_LOG_FILE_NAME="lsf-da-ubuntu-suite.log" + # # source .tekton/scripts/issue_track.sh + # # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + # - name: validation-lsf-regions-suite + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # VALIDATION_LOG_FILE_NAME="lsf-da-regions-suite.log" + # source .tekton/scripts/issue_track.sh + # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + # - name: validation-lsf-da-negative-suite-1 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # VALIDATION_LOG_FILE_NAME="lsf-da-negative-suite-1.log" + # source .tekton/scripts/issue_track.sh + # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + # - name: validation-lsf-da-negative-suite-2 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # VALIDATION_LOG_FILE_NAME="lsf-da-negative-suite-2.log" + # source .tekton/scripts/issue_track.sh + # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + # - name: validation-lsf-da-negative-suite-3 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # VALIDATION_LOG_FILE_NAME="lsf-da-negative-suite-3.log" + # source .tekton/scripts/issue_track.sh + # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + # - name: validation-lsf-da-negative-suite-4 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # VALIDATION_LOG_FILE_NAME="lsf-da-negative-suite-4.log" + # source .tekton/scripts/issue_track.sh + # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + # - name: validation-lsf-da-negative-suite-5 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # VALIDATION_LOG_FILE_NAME="lsf-da-negative-suite-5.log" + # source .tekton/scripts/issue_track.sh + # display_validation_log "${VALIDATION_LOG_FILE_NAME}" + - name: display-lsf-da-infra-logs + runAfter: [ + wes-lsf-da-rhel-1, + wes-lsf-da-rhel-2, + wes-lsf-da-rhel-3, + wes-lsf-da-rhel-4, + # wes-lsf-da-region, + # wes-lsf-da-negative, + ] + workspaces: + - name: workspace + workspace: pipeline-ws + taskSpec: + workspaces: + - name: workspace + description: The git repo will be cloned onto the volume backing this workspace + mountPath: /artifacts + steps: + - name: display-infra-da-log-rhel-suite-1 + onError: continue + 
image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + LOG_FILE_NAME="lsf-da-rhel-suite-1.json" + source .tekton/scripts/issue_track.sh + issue_track "${LOG_FILE_NAME}" + - name: display-infra-da-log-rhel-suite-2 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + LOG_FILE_NAME="lsf-da-rhel-suite-2.json" + source .tekton/scripts/issue_track.sh + issue_track "${LOG_FILE_NAME}" + - name: display-infra-da-log-rhel-suite-3 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + LOG_FILE_NAME="lsf-da-rhel-suite-3.json" + source .tekton/scripts/issue_track.sh + issue_track "${LOG_FILE_NAME}" + - name: display-infra-da-log-rhel-suite-4 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + LOG_FILE_NAME="lsf-da-rhel-suite-4.json" + source .tekton/scripts/issue_track.sh + issue_track "${LOG_FILE_NAME}" + - name: display-infra-da-log-rhel-suite-5 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + LOG_FILE_NAME="lsf-da-rhel-suite-5.json" + source .tekton/scripts/issue_track.sh + issue_track "${LOG_FILE_NAME}" + - name: display-infra-da-log-rhel-suite-6 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + LOG_FILE_NAME="lsf-da-rhel-suite-6.json" + source .tekton/scripts/issue_track.sh + issue_track "${LOG_FILE_NAME}" + - name: display-infra-da-log-rhel-suite-7 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + LOG_FILE_NAME="lsf-da-rhel-suite-7.json" + source .tekton/scripts/issue_track.sh + issue_track "${LOG_FILE_NAME}" + - name: display-infra-da-log-rhel-suite-8 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + LOG_FILE_NAME="lsf-da-rhel-suite-8.json" + source .tekton/scripts/issue_track.sh + issue_track "${LOG_FILE_NAME}" + # - name: display-infra-da-log-rhel-suite-9 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # LOG_FILE_NAME="lsf-da-rhel-suite-9.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" + # - name: display-infra-da-log-rhel-suite-10 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # LOG_FILE_NAME="lsf-da-rhel-suite-10.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" + # - name: display-infra-da-log-rhel-suite-11 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + 
# - | + # #!/bin/bash + # LOG_FILE_NAME="lsf-da-rhel-suite-11.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" + # # - name: display-infra-log-ubuntu-suite + # # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # # workingDir: "/artifacts" + # # command: ["/bin/bash", "-c"] + # # args: + # # - | + # # #!/bin/bash + # # LOG_FILE_NAME="pipeline-ubuntu-suite.json" + # # source .tekton/scripts/issue_track.sh + # # issue_track "${LOG_FILE_NAME}" + # - name: display-infra-log-regions-suite + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # onError: continue + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # LOG_FILE_NAME="lsf-regions-suite.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" + # - name: display-infra-log-negative-suite-1 + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # onError: continue + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # LOG_FILE_NAME="lsf-da-negative-suite-1.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" "negative_suite" + # - name: display-infra-log-negative-suite-2 + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # onError: continue + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # LOG_FILE_NAME="lsf-da-negative-suite-2.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" "negative_suite" + # - name: display-infra-log-negative-suite-3 + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # onError: continue + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # LOG_FILE_NAME="lsf-da-negative-suite-3.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" "negative_suite" + # - name: display-infra-log-negative-suite-4 + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # onError: continue + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # LOG_FILE_NAME="lsf-da-negative-suite-4.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" "negative_suite" + # - name: display-infra-log-negative-suite-5 + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # onError: continue + # workingDir: "/artifacts" + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + # LOG_FILE_NAME="lsf-da-negative-suite-5.json" + # source .tekton/scripts/issue_track.sh + # issue_track "${LOG_FILE_NAME}" "negative_suite" + - name: error-check-on-lsf-da-infra-logs + runAfter: [display-validation-logs, display-lsf-da-infra-logs] + workspaces: + - name: workspace + workspace: pipeline-ws + taskSpec: + workspaces: + - name: workspace + description: The git repo will be cloned onto the volume backing this workspace + mountPath: /artifacts + steps: + - name: status-on-infra-log + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + source .tekton/scripts/issue_track.sh + DIRECTORY="/artifacts/tests/lsf_tests" + pattern="*.json" + error_check_on_all_file "${DIRECTORY}" "${pattern}" "infra" + - name: error-check-on-lsf-validation-logs + runAfter: [display-validation-logs, display-lsf-da-infra-logs] + workspaces: 
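+      # The display and error-check steps in this pipeline rely on helper functions sourced from
+      # .tekton/scripts/issue_track.sh, which is not included in this diff. A minimal sketch of the
+      # assumed contract (file locations inferred from the DIRECTORY values used by the error-check steps):
+      #   display_validation_log() { cat "tests/logs_output/$1"; }     # print a suite .log file
+      #   issue_track() { jq '.' "tests/lsf_tests/$1"; }               # summarise a suite .json result
+      #   error_check_on_all_file() {                                  # fail when any matching file reports errors
+      #     local dir=$1 pattern=$2 kind=$3
+      #     ! grep -ril --include="$pattern" -e error -e fail "$dir"
+      #   }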
+ - name: workspace + workspace: pipeline-ws + taskSpec: + workspaces: + - name: workspace + description: The git repo will be cloned onto the volume backing this workspace + mountPath: /artifacts + steps: + - name: status-on-validation-log + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + source .tekton/scripts/issue_track.sh + DIRECTORY="/artifacts/tests/logs_output" + pattern="*.log" + error_check_on_all_file "${DIRECTORY}" "${pattern}" "validation" diff --git a/.tekton/lsf-da-longterm/lsf-da-regression-pipeline/task-inspect-git-content.yaml b/.tekton/lsf-da-longterm/lsf-da-regression-pipeline/task-inspect-git-content.yaml new file mode 100644 index 00000000..a437becd --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf-da-regression-pipeline/task-inspect-git-content.yaml @@ -0,0 +1,81 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: inspect-git-content +spec: + params: + - name: repository + description: the git repo url + - name: directory-name + default: "." + - name: properties-file + default: build.properties + - name: git-credentials-json-file + default: "" + - name: git-branch + description: The active branch for the repository + - name: git-commit + description: The current commit id that was cloned + - name: git-user + description: The auth user that cloned the repository + workspaces: + - name: workspace + mountPath: /artifacts + steps: + - name: inspect-git-content + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:3.29 + env: + - name: REPOSITORY + value: $(params.repository) + - name: DIRECTORY_NAME + value: $(params.directory-name) + workingDir: /artifacts + command: ["/bin/sh", "-c"] + args: + - | + cd "$DIRECTORY_NAME" + pwd + # show the git content + echo "Executing 'git show-branch --all'" + git show-branch --all + echo "" + # show the directory content recursively + echo "##############" + ls -l -R + echo "" + # show the README.md content + echo "##############" + echo "Executing 'cat README.md'" + cat README.md + echo "" + echo "##############" + echo "Executing 'cat $(workspaces.workspace.path)/$(params.properties-file)'" + cat $(workspaces.workspace.path)/$(params.properties-file) + echo "" + if [ "$(params.git-credentials-json-file)" ]; then + echo "##############" + echo "Executing 'jq $(workspaces.workspace.path)/$(params.git-credentials-json-file)'" + cat $(workspaces.workspace.path)/$(params.git-credentials-json-file) | jq '. 
| ."GIT_TOKEN"=""' + fi + if [ -z "$GIT_TOKEN" ]; then + AUTHTYPE=$(jq -r --arg git_repo "$REPOSITORY" \ + '.services[] | select (.parameters.repo_url==$git_repo) | .parameters.auth_type' \ + /cd-config/toolchain.json) + if [[ "${AUTHTYPE}" == "pat" ]]; then + TOKEN=$(jq -r --arg git_repo "$REPOSITORY" \ + '.services[] | select (.parameters.repo_url==$git_repo) | .parameters.api_token' \ + /cd-config/toolchain.json) + if [[ "${TOKEN}" ]]; then + echo "Using access token from toolchain" + GIT_TOKEN="${TOKEN}" + fi + fi + fi + echo "##############" + echo "Showing task inputs:" + echo "params.repository: $(params.repository)" + echo "params.git-branch: $(params.git-branch)" + echo "params.git-commit: $(params.git-commit)" + echo "params.git-user: $(params.git-user)" + echo "params.directory-name: $(params.directory-name)" diff --git a/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-1.yaml b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-1.yaml new file mode 100644 index 00000000..64f90450 --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-1.yaml @@ -0,0 +1,166 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: wes-lsf-da-rhel-1 +spec: + params: + - name: ibmcloud-api + description: the ibmcloud api + default: https://cloud.ibm.com + - name: continuous-delivery-context-secret + description: name of the secret containing the continuous delivery pipeline context secrets + default: secure-properties + - name: ibmcloud-apikey-secret-key + description: field in the secret that contains the api key used to login to ibmcloud + default: ibmcloud_api_key + - name: pipeline-debug + description: Pipeline debug mode. Value can be 0 or 1. Default to 0 + default: "0" + - name: revision + description: | + the git revision/commit to update the git HEAD to. + Default is to mean only use the branch + default: "" + - name: directory-name + default: "." + - name: repository + description: the git repo url + - name: git_access_token + description: the token to access the git repository for the clone operations + default: "" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed and requires a single zone input value. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note. If the resource group value is set as null, automation creates two different RG with the name (workload-rg and service-rg). For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). 
If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: cos_region + description: The cos region name. + default: "" + - name: cos_bucket + description: The cos bucket name. + default: "" + - name: cos_instance_crn + description: The cos instance crn. + default: "" + - name: cos_api_key + description: The cos account api key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL88 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: solution + description: Provide the value for the solution that is needed for the support of lsf and HPC. 
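+      # Accepted values are assumed to be "lsf" or "hpc" (both are named in the description above);
+      # the chosen value reaches the suite scripts through the "solution" entry in stepTemplate.env.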
+ default: "lsf" + workspaces: + - name: workspace + mountPath: /artifacts + stepTemplate: + env: + - name: API_KEY + valueFrom: + secretKeyRef: + name: $(params.continuous-delivery-context-secret) + key: $(params.ibmcloud-apikey-secret-key) + optional: true + - name: BUILD_NUMBER + valueFrom: + fieldRef: + fieldPath: metadata.annotations['devops.cloud.ibm.com/build-number'] + - name: PIPELINE_DEBUG + value: $(params.pipeline-debug) + - name: REVISION + value: $(params.revision) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: login_image_name + value: $(params.login_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: git_access_token + value: $(params.git_access_token) + - name: solution + value: $(params.solution) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + steps: + - name: rhel-suite-1 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_1 + - name: rhel-suite-2 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_2 diff --git a/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-2.yaml b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-2.yaml new file mode 100644 index 00000000..3ef25751 --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-2.yaml @@ -0,0 +1,166 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: wes-lsf-da-rhel-2 +spec: + params: + - name: ibmcloud-api + description: the ibmcloud api + default: https://cloud.ibm.com + - name: continuous-delivery-context-secret + description: name of the secret containing the continuous delivery pipeline context secrets + default: secure-properties + - name: ibmcloud-apikey-secret-key + description: field in the secret that contains the api key used to login to ibmcloud + default: ibmcloud_api_key + - name: pipeline-debug + description: Pipeline debug mode. Value can be 0 or 1. Default to 0 + default: "0" + - name: revision + description: | + the git revision/commit to update the git HEAD to. + Default is to mean only use the branch + default: "" + - name: directory-name + default: "." 
+ - name: repository + description: the git repo url + - name: git_access_token + description: the token to access the git repository for the clone operations + default: "" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed and requires a single zone input value. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note. If the resource group value is set as null, automation creates two different RG with the name (workload-rg and service-rg). For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: cos_region + description: The cos region name. + default: "" + - name: cos_bucket + description: The cos bucket name. + default: "" + - name: cos_instance_crn + description: The cos instance crn. + default: "" + - name: cos_api_key + description: The cos account api key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. 
+ default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL88 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: solution + description: Provide the value for the solution that is needed for the support of lsf and HPC. + default: "lsf" + workspaces: + - name: workspace + mountPath: /artifacts + stepTemplate: + env: + - name: API_KEY + valueFrom: + secretKeyRef: + name: $(params.continuous-delivery-context-secret) + key: $(params.ibmcloud-apikey-secret-key) + optional: true + - name: BUILD_NUMBER + valueFrom: + fieldRef: + fieldPath: metadata.annotations['devops.cloud.ibm.com/build-number'] + - name: PIPELINE_DEBUG + value: $(params.pipeline-debug) + - name: REVISION + value: $(params.revision) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: login_image_name + value: $(params.login_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: git_access_token + value: $(params.git_access_token) + - name: solution + value: $(params.solution) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + steps: + - name: rhel-suite-3 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_3 + - name: rhel-suite-4 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_4 diff --git a/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-3.yaml b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-3.yaml new file mode 100644 index 00000000..7660f361 --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-3.yaml @@ -0,0 +1,166 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: wes-lsf-da-rhel-3 +spec: + params: + - name: ibmcloud-api + description: the ibmcloud api + default: https://cloud.ibm.com + - name: 
continuous-delivery-context-secret + description: name of the secret containing the continuous delivery pipeline context secrets + default: secure-properties + - name: ibmcloud-apikey-secret-key + description: field in the secret that contains the api key used to login to ibmcloud + default: ibmcloud_api_key + - name: pipeline-debug + description: Pipeline debug mode. Value can be 0 or 1. Default to 0 + default: "0" + - name: revision + description: | + the git revision/commit to update the git HEAD to. + Default is to mean only use the branch + default: "" + - name: directory-name + default: "." + - name: repository + description: the git repo url + - name: git_access_token + description: the token to access the git repository for the clone operations + default: "" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed and requires a single zone input value. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note. If the resource group value is set as null, automation creates two different RG with the name (workload-rg and service-rg). For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: cos_region + description: The cos region name. + default: "" + - name: cos_bucket + description: The cos bucket name. + default: "" + - name: cos_instance_crn + description: The cos instance crn. + default: "" + - name: cos_api_key + description: The cos account api key. 
+ default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL88 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: solution + description: Provide the value for the solution that is needed for the support of lsf and HPC. + default: "lsf" + workspaces: + - name: workspace + mountPath: /artifacts + stepTemplate: + env: + - name: API_KEY + valueFrom: + secretKeyRef: + name: $(params.continuous-delivery-context-secret) + key: $(params.ibmcloud-apikey-secret-key) + optional: true + - name: BUILD_NUMBER + valueFrom: + fieldRef: + fieldPath: metadata.annotations['devops.cloud.ibm.com/build-number'] + - name: PIPELINE_DEBUG + value: $(params.pipeline-debug) + - name: REVISION + value: $(params.revision) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: login_image_name + value: $(params.login_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: git_access_token + value: $(params.git_access_token) + - name: solution + value: $(params.solution) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + steps: + - name: rhel-suite-5 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_5 + - name: rhel-suite-6 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_6 diff --git 
a/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-4.yaml b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-4.yaml new file mode 100644 index 00000000..0dfd92c6 --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-infra-rhel-4.yaml @@ -0,0 +1,190 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: wes-lsf-da-rhel-4 +spec: + params: + - name: ibmcloud-api + description: the ibmcloud api + default: https://cloud.ibm.com + - name: continuous-delivery-context-secret + description: name of the secret containing the continuous delivery pipeline context secrets + default: secure-properties + - name: ibmcloud-apikey-secret-key + description: field in the secret that contains the api key used to login to ibmcloud + default: ibmcloud_api_key + - name: pipeline-debug + description: Pipeline debug mode. Value can be 0 or 1. Default to 0 + default: "0" + - name: revision + description: | + the git revision/commit to update the git HEAD to. + Default is to mean only use the branch + default: "" + - name: directory-name + default: "." + - name: repository + description: the git repo url + - name: git_access_token + description: the token to access the git repository for the clone operations + default: "" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed and requires a single zone input value. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note. If the resource group value is set as null, automation creates two different RG with the name (workload-rg and service-rg). For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2). 
If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: cos_region + description: The cos region name. + default: "" + - name: cos_bucket + description: The cos bucket name. + default: "" + - name: cos_instance_crn + description: The cos instance crn. + default: "" + - name: cos_api_key + description: The cos account api key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL88 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: solution + description: Provide the value for the solution that is needed for the support of lsf and HPC. 
+ default: "lsf" + - name: pac_ha_exist_certificate + description: PAC HA Existing Certificate + default: "" + workspaces: + - name: workspace + mountPath: /artifacts + stepTemplate: + env: + - name: API_KEY + valueFrom: + secretKeyRef: + name: $(params.continuous-delivery-context-secret) + key: $(params.ibmcloud-apikey-secret-key) + optional: true + - name: BUILD_NUMBER + valueFrom: + fieldRef: + fieldPath: metadata.annotations['devops.cloud.ibm.com/build-number'] + - name: PIPELINE_DEBUG + value: $(params.pipeline-debug) + - name: REVISION + value: $(params.revision) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: login_image_name + value: $(params.login_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: git_access_token + value: $(params.git_access_token) + - name: solution + value: $(params.solution) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + - name: pac_ha_exist_certificate + value: $(params.pac_ha_exist_certificate) + steps: + - name: rhel-suite-7 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_7 + - name: rhel-suite-8 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_8 + # - name: rhel-suite-9 + # onError: continue + # image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + # workingDir: "/artifacts" + # imagePullPolicy: Always + # command: ["/bin/bash", "-c"] + # args: + # - | + # #!/bin/bash + + # if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + # pwd + # env + # trap env EXIT + # set -x + # fi + + # source .tekton/scripts/suites.sh + # lsf_da_rhel_suite_9 diff --git a/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-negative.yaml b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-negative.yaml new file mode 100644 index 00000000..628b9afe --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-negative.yaml @@ -0,0 +1,247 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: wes-lsf-da-negative +spec: + params: + - name: ibmcloud-api + description: the ibmcloud api + default: https://cloud.ibm.com + - name: continuous-delivery-context-secret + description: name of the secret containing the continuous delivery pipeline context secrets + default: secure-properties + - name: ibmcloud-apikey-secret-key + description: field in the secret that contains the api key used to login 
to ibmcloud + default: ibmcloud_api_key + - name: pipeline-debug + description: Pipeline debug mode. Value can be 0 or 1. Default to 0 + default: "0" + - name: revision + description: | + the git revision/commit to update the git HEAD to. + Default is to mean only use the branch + default: "" + - name: directory-name + default: "." + - name: repository + description: the git repo url + - name: git_access_token + description: the token to access the git repository for the clone operations + default: "" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed and requires a single zone input value. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note. If the resource group value is set as null, automation creates two different RG with the name (workload-rg and service-rg). For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: cos_region + description: The cos region name. + default: "" + - name: cos_bucket + description: The cos bucket name. + default: "" + - name: cos_instance_crn + description: The cos instance crn. + default: "" + - name: cos_api_key + description: The cos account api key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. 
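+      # The cos_* and hpc_custom_reports_* parameters are assumed to be consumed by suites.sh to
+      # publish test reports to Cloud Object Storage and to a reports repository; they default to
+      # empty strings (the branch defaults to "main"), so report publishing appears to be optional.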
+ default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL88 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: solution + description: Provide the value for the solution that is needed for the support of lsf and HPC. + default: "lsf" + workspaces: + - name: workspace + mountPath: /artifacts + stepTemplate: + env: + - name: API_KEY + valueFrom: + secretKeyRef: + name: $(params.continuous-delivery-context-secret) + key: $(params.ibmcloud-apikey-secret-key) + optional: true + - name: BUILD_NUMBER + valueFrom: + fieldRef: + fieldPath: metadata.annotations['devops.cloud.ibm.com/build-number'] + - name: PIPELINE_DEBUG + value: $(params.pipeline-debug) + - name: REVISION + value: $(params.revision) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: login_image_name + value: $(params.login_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: git_access_token + value: $(params.git_access_token) + - name: solution + value: $(params.solution) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + steps: + - name: negative-suite-1 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + echo "${hpc_custom_reports_repo}" + lsf_negative_suite_1 + - name: negative-suite-2 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + echo "${hpc_custom_reports_repo}" + lsf_negative_suite_2 + - name: negative-suite-3 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", 
"-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + echo "${hpc_custom_reports_repo}" + lsf_negative_suite_3 + - name: negative-suite-4 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + echo "${hpc_custom_reports_repo}" + lsf_negative_suite_4 + - name: negative-suite-5 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + echo "${hpc_custom_reports_repo}" + lsf_negative_suite_5 + - name: rhel-suite-11 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_11 diff --git a/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-pr-rhel.yaml b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-pr-rhel.yaml new file mode 100644 index 00000000..9f318bfc --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-pr-rhel.yaml @@ -0,0 +1,149 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: wes-lsf-da-rhel-pr +spec: + params: + - name: ibmcloud-api + description: the ibmcloud api + default: https://cloud.ibm.com + - name: continuous-delivery-context-secret + description: name of the secret containing the continuous delivery pipeline context secrets + default: secure-properties + - name: ibmcloud-apikey-secret-key + description: field in the secret that contains the api key used to login to ibmcloud + default: ibmcloud_api_key + - name: pipeline-debug + description: Pipeline debug mode. Value can be 0 or 1. Default to 0 + default: "0" + - name: pr-branch + description: The source branch for the PullRequest + default: "" + - name: directory-name + default: "." + - name: repository + description: the git repo url + - name: git_access_token + description: the token to access the git repository for the clone operations + default: "" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed and requires a single zone input value. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note. If the resource group value is set as null, automation creates two different RG with the name (workload-rg and service-rg). For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). 
+ default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: pr-revision + description: the commit/revision in the source branch of the PullRequest that is to be built + default: "" + - name: cos_region + description: The cos region name. + default: "" + - name: cos_bucket + description: The cos bucket name. + default: "" + - name: cos_instance_crn + description: The cos instance crn. + default: "" + - name: cos_api_key + description: The cos account api key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL88 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: solution + description: Provide the value for the solution that is needed for the support of lsf and HPC. 
+ default: "lsf" + workspaces: + - name: workspace + mountPath: /artifacts + stepTemplate: + env: + - name: API_KEY + valueFrom: + secretKeyRef: + name: $(params.continuous-delivery-context-secret) + key: $(params.ibmcloud-apikey-secret-key) + optional: true + - name: BUILD_NUMBER + valueFrom: + fieldRef: + fieldPath: metadata.annotations['devops.cloud.ibm.com/build-number'] + - name: PIPELINE_DEBUG + value: $(params.pipeline-debug) + - name: PR_REVISION + value: $(params.pr-revision) + steps: + - name: test-run-basic-rhel-pr + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + env: + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: login_image_name + value: $(params.login_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: git_access_token + value: $(params.git_access_token) + - name: solution + value: $(params.solution) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_pr_rhel_suite diff --git a/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-region.yaml b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-region.yaml new file mode 100644 index 00000000..0c99a9e9 --- /dev/null +++ b/.tekton/lsf-da-longterm/lsf_da_task/lsf-task-region.yaml @@ -0,0 +1,166 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: wes-lsf-da-region +spec: + params: + - name: ibmcloud-api + description: the ibmcloud api + default: https://cloud.ibm.com + - name: continuous-delivery-context-secret + description: name of the secret containing the continuous delivery pipeline context secrets + default: secure-properties + - name: ibmcloud-apikey-secret-key + description: field in the secret that contains the api key used to login to ibmcloud + default: ibmcloud_api_key + - name: pipeline-debug + description: Pipeline debug mode. Value can be 0 or 1. Default to 0 + default: "0" + - name: revision + description: | + the git revision/commit to update the git HEAD to. + Default is to mean only use the branch + default: "" + - name: directory-name + default: "." + - name: repository + description: the git repo url + - name: git_access_token + description: the token to access the git repository for the clone operations + default: "" + - name: zone + default: "" + description: The IBM Cloud zone name within the selected region where the IBM Cloud HPC cluster should be deployed and requires a single zone input value. Supported zones are eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. 
The management nodes, file storage shares, and compute nodes will be deployed in the same zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli). + - name: resource_group + description: Resource group name from your IBM Cloud account where the VPC resources should be deployed. Note. If the resource group value is set as null, automation creates two different RG with the name (workload-rg and service-rg). For additional information on resource groups, see [Managing resource groups](https://cloud.ibm.com/docs/account?topic=account-rgs). + default: Default + - name: compute_image_name_rhel + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster dynamic compute nodes. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v1). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: login_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster login node. By default, the solution uses a RHEL 8-6 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/hpc-spectrum-LSF#create-custom-image). The solution also offers, Ubuntu 22-04 OS base image (hpcaas-lsf10-ubuntu2204-compute-v2). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. + default: "" + - name: cos_region + description: The cos region name. + default: "" + - name: cos_bucket + description: The cos bucket name. + default: "" + - name: cos_instance_crn + description: The cos instance crn. + default: "" + - name: cos_api_key + description: The cos account api key. + default: "" + - name: hpc_custom_reports_repo + description: The HPC custom reports storage repository. + default: "" + - name: hpc_custom_reports_branch + description: The HPC custom reports storage repository branch. + default: "main" + - name: git_user_name + description: The git user name. + default: "" + - name: git_user_email + description: The git user email. + default: "" + - name: management_image_name + description: Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud HPC cluster management nodes. By default, the solution uses a RHEL88 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the IBM Cloud HPC cluster through this offering. 
+ default: "" + - name: deployer_image_name + description: Name of the deployer image. + default: "" + - name: solution + description: Provide the value for the solution that is needed for the support of lsf and HPC. + default: "lsf" + workspaces: + - name: workspace + mountPath: /artifacts + stepTemplate: + env: + - name: API_KEY + valueFrom: + secretKeyRef: + name: $(params.continuous-delivery-context-secret) + key: $(params.ibmcloud-apikey-secret-key) + optional: true + - name: BUILD_NUMBER + valueFrom: + fieldRef: + fieldPath: metadata.annotations['devops.cloud.ibm.com/build-number'] + - name: PIPELINE_DEBUG + value: $(params.pipeline-debug) + - name: REVISION + value: $(params.revision) + - name: zone + value: $(params.zone) + - name: resource_group + value: $(params.resource_group) + - name: compute_image_name_rhel + value: $(params.compute_image_name_rhel) + - name: login_image_name + value: $(params.login_image_name) + - name: cos_region + value: $(params.cos_region) + - name: cos_bucket + value: $(params.cos_bucket) + - name: cos_instance_crn + value: $(params.cos_instance_crn) + - name: cos_api_key + value: $(params.cos_api_key) + - name: hpc_custom_reports_repo + value: $(params.hpc_custom_reports_repo) + - name: hpc_custom_reports_branch + value: $(params.hpc_custom_reports_branch) + - name: git_user_name + value: $(params.git_user_name) + - name: git_user_email + value: $(params.git_user_email) + - name: git_access_token + value: $(params.git_access_token) + - name: solution + value: $(params.solution) + - name: management_image_name + value: $(params.management_image_name) + - name: deployer_image_name + value: $(params.deployer_image_name) + steps: + - name: regions-scenario + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_regions_suite + - name: rhel-suite-10 + onError: continue + image: icr.io/continuous-delivery/pipeline/pipeline-base-ubi:latest + workingDir: "/artifacts" + imagePullPolicy: Always + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [[ "${PIPELINE_DEBUG}" == "true" ]]; then + pwd + env + trap env EXIT + set -x + fi + + source .tekton/scripts/suites.sh + lsf_da_rhel_suite_10 diff --git a/.tekton/scripts/common_utils.sh b/.tekton/scripts/common_utils.sh index 2352a61e..60d51945 100644 --- a/.tekton/scripts/common_utils.sh +++ b/.tekton/scripts/common_utils.sh @@ -19,6 +19,14 @@ get_commit_ssh_key() { CICD_SSH_KEY=$CICD_SSH_KEY-tekton fi fi + if [[ "$CHECK_SOLUTION" == "lsf-da" ]]; then + CICD_SSH_KEY=cicd-lsf-da + if [ "${REVISION}" ]; then + CICD_SSH_KEY=$(echo $CICD_SSH_KEY-"$REVISION") + else + CICD_SSH_KEY=$CICD_SSH_KEY-tekton + fi + fi } @@ -33,6 +41,10 @@ get_pr_ssh_key() { CICD_SSH_KEY=cicd-lsf CICD_SSH_KEY=$(echo $CICD_SSH_KEY-"$PR_REVISION") fi + if [[ "$CHECK_SOLUTION" == "lsf-da" ]]; then + CICD_SSH_KEY=cicd-lsf + CICD_SSH_KEY=$(echo $CICD_SSH_KEY-"$PR_REVISION") + fi } git clone --depth=1 https://github.com/tfutils/tfenv.git ~/.tfenv diff --git a/.tekton/scripts/cos_upload.sh b/.tekton/scripts/cos_upload.sh index 84dba5c6..1a96deae 100644 --- a/.tekton/scripts/cos_upload.sh +++ b/.tekton/scripts/cos_upload.sh @@ -18,7 +18,7 @@ cos_upload() { COMMIT_MESSAGE="manual" fi - ls -ltr "$DIRECTORY"/logs + ls -ltr "$DIRECTORY" echo "***********INSTALL 
IBM-COS-SDK *************" python3 -m pip install --pre --upgrade ibm-cos-sdk==2.0.1 --quiet @@ -26,14 +26,20 @@ cos_upload() { if [[ "$CHECK_SOLUTION" == "hpcaas" ]]; then if [[ "$CHECK_PR" == "REGRESSION" ]]; then - python3 /artifacts/.tekton/scripts/cos_data.py UPLOAD "$DIRECTORY"/logs/"$VALIDATION_LOG_FILE_NAME" "$COS_FOLDER"/HPCAAS/VALIDATION_LOG/"$COMMIT_MESSAGE"/"$VALIDATION_LOG_FILE" + python3 /artifacts/.tekton/scripts/cos_data.py UPLOAD "$DIRECTORY"/logs_output/"$VALIDATION_LOG_FILE_NAME" "$COS_FOLDER"/HPCAAS/VALIDATION_LOG/"$COMMIT_MESSAGE"/"$VALIDATION_LOG_FILE" fi python3 /artifacts/.tekton/scripts/cos_data.py UPLOAD "$LOG_FILE_NAME" "$COS_FOLDER"/HPCAAS/INFRA_LOG/"$COMMIT_MESSAGE"/"$LOG_FILE_NAME"-"$CURRENT_DATE_FILE".log fi if [[ "$CHECK_SOLUTION" == "lsf" ]]; then if [[ "$CHECK_PR" == "REGRESSION" ]]; then - python3 /artifacts/.tekton/scripts/cos_data.py UPLOAD "$DIRECTORY"/logs/"$VALIDATION_LOG_FILE_NAME" "$COS_FOLDER"/LSF/VALIDATION_LOG/"$COMMIT_MESSAGE"/"$VALIDATION_LOG_FILE" + python3 /artifacts/.tekton/scripts/cos_data.py UPLOAD "$DIRECTORY"/logs_output/"$VALIDATION_LOG_FILE_NAME" "$COS_FOLDER"/LSF/VALIDATION_LOG/"$COMMIT_MESSAGE"/"$VALIDATION_LOG_FILE" fi python3 /artifacts/.tekton/scripts/cos_data.py UPLOAD "$LOG_FILE_NAME" "$COS_FOLDER"/LSF/INFRA_LOG/"$COMMIT_MESSAGE"/"$LOG_FILE_NAME"-"$CURRENT_DATE_FILE".log fi + if [[ "$CHECK_SOLUTION" == "lsf-da" ]]; then + if [[ "$CHECK_PR" == "REGRESSION" ]]; then + python3 /artifacts/.tekton/scripts/cos_data.py UPLOAD "$DIRECTORY"/logs_output/"$VALIDATION_LOG_FILE_NAME" "$COS_FOLDER"/LSF-DA/VALIDATION_LOG/"$COMMIT_MESSAGE"/"$VALIDATION_LOG_FILE" + fi + python3 /artifacts/.tekton/scripts/cos_data.py UPLOAD "$LOG_FILE_NAME" "$COS_FOLDER"/LSF-DA/INFRA_LOG/"$COMMIT_MESSAGE"/"$LOG_FILE_NAME"-"$CURRENT_DATE_FILE".log + fi } diff --git a/.tekton/scripts/issue_track.sh b/.tekton/scripts/issue_track.sh index 1db43446..4ed0ae41 100644 --- a/.tekton/scripts/issue_track.sh +++ b/.tekton/scripts/issue_track.sh @@ -45,7 +45,7 @@ issue_track() { DIRECTORY="/artifacts/tests" if [ -d "$DIRECTORY" ]; then if [[ "${LOG_FILE_NAME}" == *"negative"* ]]; then - negative_log_error_check=$(grep -v -e 'Terraform upgrade output:' -e 'Error retrieving reservation ID from secrets:' -e 'Field validation for' $DIRECTORY/"$LOG_FILE_NAME" | grep 'FAIL') + negative_log_error_check=$(grep -v -e 'Terraform upgrade output:' -e 'Error retrieving reservation ID from secrets:' -e 'Field validation for' $DIRECTORY/lsf_tests/"$LOG_FILE_NAME" | grep 'FAIL') if [[ "$negative_log_error_check" ]]; then echo "${negative_log_error_check}" echo "Found FAIL in plan/apply log. Please check log : ${LOG_FILE_NAME}" @@ -53,7 +53,7 @@ issue_track() { fi else # Track error/fail from the suites log file - log_error_check=$(grep -v -e 'Terraform upgrade output:' -e 'Error retrieving reservation ID from secrets:' -e 'Field validation for' $DIRECTORY/"$LOG_FILE_NAME" | grep -E -w 'FAIL|Error|ERROR') + log_error_check=$(grep -v -e 'Terraform upgrade output:' -e 'Error retrieving reservation ID from secrets:' -e 'Field validation for' $DIRECTORY/lsf_tests/"$LOG_FILE_NAME" | grep -E -w 'FAIL|Error|ERROR') if [[ "$log_error_check" ]]; then echo "${log_error_check}" echo "Found Error/FAIL/ERROR in plan/apply log. 
Please check log : ${LOG_FILE_NAME}" @@ -64,15 +64,15 @@ issue_track() { if [[ "${CHECK_PR_OR_TASK}" != "PR" ]]; then VALIDATION_LOG_FILE=$(echo "$LOG_FILE_NAME" | cut -f 1 -d '.').log # Track test_output log file initiated or not - test_output_file_check=$(find $DIRECTORY/logs/"$VALIDATION_LOG_FILE" 2>/dev/null) + test_output_file_check=$(find $DIRECTORY/logs_output/"$VALIDATION_LOG_FILE" 2>/dev/null) if [[ -z "$test_output_file_check" ]]; then - echo "Validation log file not initiated under ${DIRECTORY/logs/}" + echo "Validation log file not initiated under ${DIRECTORY/logs_output/}" exit 1 fi fi # Track suites log file initiated or not - log_file_check=$(find $DIRECTORY/*.json 2>/dev/null) + log_file_check=$(find $DIRECTORY/lsf_tests/*.json 2>/dev/null) if [[ -z "$log_file_check" ]]; then echo "Infra log not initiated under ${DIRECTORY}" exit 1 @@ -88,13 +88,13 @@ display_validation_log() { DIRECTORY="/artifacts/tests" if [ -d "$DIRECTORY" ]; then # Display test_output log file - validation_log_file_check=$(find $DIRECTORY/logs/"$LOG_FILE_NAME" 2>/dev/null) + validation_log_file_check=$(find $DIRECTORY/logs_output/"$LOG_FILE_NAME" 2>/dev/null) if [[ -z "$validation_log_file_check" ]]; then echo "Test output log file not initiated." exit 1 else echo "********************** DISPLAY ${LOG_FILE_NAME} VALIDATION OUTPUT LOG ********************" - cat $DIRECTORY/logs/"$LOG_FILE_NAME" + cat $DIRECTORY/logs_output/"$LOG_FILE_NAME" echo "********************** DISPLAY ${LOG_FILE_NAME} VALIDATION OUTPUT LOG **********************" echo "##################################################################################" @@ -103,9 +103,9 @@ display_validation_log() { echo "##################################################################################" echo "##################################################################################" if [[ "${LOG_FILE_NAME}" == *"negative"* ]]; then - validation_log_error_check=$(grep -v -e 'Terraform upgrade output:' -e 'Error retrieving reservation ID from secrets:' -e 'Field validation for' $DIRECTORY/logs/"$LOG_FILE_NAME" | grep -E -w 'FAIL') + validation_log_error_check=$(grep -v -e 'Terraform upgrade output:' -e 'Error retrieving reservation ID from secrets:' -e 'Field validation for' $DIRECTORY/logs_output/"$LOG_FILE_NAME" | grep -E -w 'FAIL') else - validation_log_error_check=$(grep -v -e 'Terraform upgrade output:' -e 'Error retrieving reservation ID from secrets:' -e 'Field validation for' $DIRECTORY/logs/"$LOG_FILE_NAME" | grep -E -w 'FAIL|Error|ERROR') + validation_log_error_check=$(grep -v -e 'Terraform upgrade output:' -e 'Error retrieving reservation ID from secrets:' -e 'Field validation for' $DIRECTORY/logs_output/"$LOG_FILE_NAME" | grep -E -w 'FAIL|Error|ERROR') fi # Display if any error in validation log @@ -115,7 +115,7 @@ display_validation_log() { echo "********************** ERROR CHECK in ${LOG_FILE_NAME} VALIDATION OUTPUT LOG **********************" exit 1 else - echo "No Error found in $DIRECTORY/logs/$LOG_FILE_NAME" + echo "No Error found in $DIRECTORY/logs_output/$LOG_FILE_NAME" fi fi else diff --git a/.tekton/scripts/push_reports.sh b/.tekton/scripts/push_reports.sh index 05827f63..1a45ccca 100644 --- a/.tekton/scripts/push_reports.sh +++ b/.tekton/scripts/push_reports.sh @@ -28,6 +28,9 @@ push_reports() { if [[ "$CHECK_SOLUTION" == "lsf" ]]; then folder_name="lsf/${time_stamp}/$PR_OR_REGRESSION/${BUILD_NUMBER}" fi + if [[ "$CHECK_SOLUTION" == "lsf-da" ]]; then + 
folder_name="lsf-da/${time_stamp}/$PR_OR_REGRESSION/${BUILD_NUMBER}" + fi mkdir -p "${folder_name}" git pull origin "${hpc_custom_reports_branch:?}" cp "$DIRECTORY"/"${HTML_FILE_NAME}".html "$DIRECTORY"/push_reports/"${suite}"/"${folder_name}" diff --git a/.tekton/scripts/ssh_create_delete.sh b/.tekton/scripts/ssh_create_delete.sh index 506a0679..dea78aab 100644 --- a/.tekton/scripts/ssh_create_delete.sh +++ b/.tekton/scripts/ssh_create_delete.sh @@ -23,6 +23,17 @@ set_ssh_key_name() { CICD_SSH_KEY=$CICD_SSH_KEY-tekton fi fi + + if [[ "$CHECK_SOLUTION" == "lsf-da" ]]; then + CICD_SSH_KEY=cicd-lsf-da + if [ -z "${PR_REVISION}" ] && [ "${REVISION}" ]; then + CICD_SSH_KEY=$(echo $CICD_SSH_KEY-"$REVISION") + elif [ "${PR_REVISION}" ] && [ -z "${REVISION}" ]; then + CICD_SSH_KEY=$(echo $CICD_SSH_KEY-"$PR_REVISION") + else + CICD_SSH_KEY=$CICD_SSH_KEY-tekton + fi + fi } ssh_key_create() { diff --git a/.tekton/scripts/suites.sh b/.tekton/scripts/suites.sh index fdbbff69..9191b5d8 100644 --- a/.tekton/scripts/suites.sh +++ b/.tekton/scripts/suites.sh @@ -10,8 +10,9 @@ common_suite() { source "$file" done export TF_VAR_ibmcloud_api_key=$API_KEY + export TF_VAR_github_token=${git_access_token:?} - DIRECTORY="/artifacts/tests" + DIRECTORY="/artifacts/tests/lsf_tests" if [ -d "$DIRECTORY" ]; then cd $DIRECTORY || exit test_cases="${test_cases//,/|}" @@ -23,7 +24,7 @@ common_suite() { if [[ "$CHECK_SOLUTION" == "hpcaas" ]]; then # get ssh-key created based on pr-id get_pr_ssh_key "${PR_REVISION}" "${CHECK_SOLUTION}" - SSH_KEY=${CICD_SSH_KEY:?} COMPUTE_IMAGE_NAME=${compute_image_name:?} LOGIN_NODE_IMAGE_NAME=${login_image_name:?} MANAGEMENT_IMAGE_NAME=${management_image_name:?} \ + SSH_KEYS=${CICD_SSH_KEY:?} COMPUTE_IMAGE_NAME=${compute_image_name:?} LOGIN_NODE_IMAGE_NAME=${login_image_name:?} MANAGEMENT_IMAGE_NAME=${management_image_name:?} \ ZONE=${zone:?} RESERVATION_ID=${reservation_id:?} CLUSTER_NAME=${cluster_name:?} DEFAULT_EXISTING_RESOURCE_GROUP=${resource_group:?} \ go test -v -timeout 9000m -run "${test_cases}" | tee -a "$LOG_FILE" # Upload log/test_output files to cos bucket @@ -39,7 +40,7 @@ common_suite() { if [[ "$CHECK_SOLUTION" == "lsf" ]]; then # get ssh-key created based on pr-id get_pr_ssh_key "${PR_REVISION}" "${CHECK_SOLUTION}" - SSH_KEY=${CICD_SSH_KEY:?} COMPUTE_IMAGE_NAME=${compute_image_name:?} LOGIN_NODE_IMAGE_NAME=${login_image_name:?} MANAGEMENT_IMAGE_NAME=${management_image_name:?} \ + SSH_KEYS=${CICD_SSH_KEY:?} COMPUTE_IMAGE_NAME=${compute_image_name:?} LOGIN_NODE_IMAGE_NAME=${login_image_name:?} MANAGEMENT_IMAGE_NAME=${management_image_name:?} \ ZONE=${zone:?} SOLUTION=${solution:?} DEFAULT_EXISTING_RESOURCE_GROUP=${resource_group:?} \ go test -v -timeout 9000m -run "${test_cases}" | tee -a "$LOG_FILE" # Upload log/test_output files to cos bucket @@ -52,11 +53,25 @@ common_suite() { issue_track "${LOG_FILE}" "PR" fi + if [[ "$CHECK_SOLUTION" == "lsf-da" ]]; then + # get ssh-key created based on pr-id + get_pr_ssh_key "${PR_REVISION}" "${CHECK_SOLUTION}" + SSH_KEYS=${CICD_SSH_KEY:?} go test -v -timeout=900m -parallel=10 -run="${test_cases}" | tee -a "$LOG_FILE_NAME" + # Upload log/test_output files to cos bucket + cos_upload "PR" "${CHECK_SOLUTION}" "${DIRECTORY}" + + # push custom reports to custom-reports repository + push_reports "${LOG_FILE}" "${DIRECTORY}" "PR" "${suite}" "${CHECK_SOLUTION}" "${BUILD_NUMBER}" + + # Checking any error/issue from log file for pr + issue_track "${LOG_FILE}" "PR" + fi + else if [[ "$CHECK_SOLUTION" == "hpcaas" ]]; then # get ssh-key 
created based on commit-id get_commit_ssh_key "${REVISION}" "${CHECK_SOLUTION}" - SSH_KEY=${CICD_SSH_KEY:?} US_EAST_ZONE=${us_east_zone:?} US_EAST_CLUSTER_ID=${us_east_cluster_id:?} \ + SSH_KEYS=${CICD_SSH_KEY:?} US_EAST_ZONE=${us_east_zone:?} US_EAST_CLUSTER_ID=${us_east_cluster_id:?} \ US_EAST_RESERVATION_ID=${us_east_reservation_id:?} US_SOUTH_ZONE=${us_south_zone:?} \ US_SOUTH_CLUSTER_ID=${us_south_cluster_id:?} US_SOUTH_RESERVATION_ID=${us_south_reservation_id:?} \ EU_DE_ZONE=${eu_de_zone:?} EU_DE_CLUSTER_ID=${eu_de_cluster_id:?} EU_DE_RESERVATION_ID=${eu_de_reservation_id:?} \ @@ -77,7 +92,7 @@ common_suite() { if [[ "$CHECK_SOLUTION" == "lsf" ]]; then # get ssh-key created based on commit-id get_commit_ssh_key "${REVISION}" "${CHECK_SOLUTION}" - SSH_KEY=${CICD_SSH_KEY:?} COMPUTE_IMAGE_NAME=${compute_image_name:?} LOGIN_NODE_IMAGE_NAME=${login_image_name:?} MANAGEMENT_IMAGE_NAME=${management_image_name:?} \ + SSH_KEYS=${CICD_SSH_KEY:?} COMPUTE_IMAGE_NAME=${compute_image_name:?} LOGIN_NODE_IMAGE_NAME=${login_image_name:?} MANAGEMENT_IMAGE_NAME=${management_image_name:?} \ ZONE=${zone:?} SOLUTION=${solution:?} DEFAULT_EXISTING_RESOURCE_GROUP=${resource_group:?} \ go test -v -timeout 9000m -run "${test_cases}" | tee -a "$LOG_FILE" # Upload log/test_output files to cos bucket @@ -89,6 +104,20 @@ common_suite() { # Checking any error/issue from log file for commit/push issue_track "${LOG_FILE}" fi + + if [[ "$CHECK_SOLUTION" == "lsf-da" ]]; then + # get ssh-key created based on commit-id + get_commit_ssh_key "${REVISION}" "${CHECK_SOLUTION}" + SSH_KEYS=${CICD_SSH_KEY:?} go test -v -timeout=900m -parallel=10 -run="${test_cases}" | tee -a "$LOG_FILE_NAME" + # Upload log/test_output files to cos bucket + cos_upload "REGRESSION" "${CHECK_SOLUTION}" "${DIRECTORY}" "${VALIDATION_LOG_FILE_NAME}" + + # push custom reports to custom-reports repository + push_reports "${LOG_FILE}" "${DIRECTORY}" "REGRESSION" "${suite}" "${CHECK_SOLUTION}" "${BUILD_NUMBER}" + + # Checking any error/issue from log file for commit/push + issue_track "${LOG_FILE}" + fi fi else pwd @@ -423,4 +452,104 @@ lsf_negative_suite_5() { common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:?}" "${solution:?}" } -######################## HPCaaS Testcases End ######################## +######################## LSF Testcases End ######################## + +######################## LSF-DA-LONGTERM Testcases Start ######################## +# pr based suite on rhel +lsf_da_pr_rhel_suite() { + suite=lsf-da-pr-rhel-suite + solution=lsf-da + test_cases="TestRunDefault" + compute_image_name_rhel="" + new_line="${test_cases//,/$'\n'}" + echo "************** Going to run ${suite} ${new_line} **************" + common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:-}" "${solution:?}" "PR" +} + +# commit based suite on rhel-suite-1 +lsf_da_rhel_suite_1() { + suite=lsf-da-rhel-suite-1 + solution=lsf-da + test_cases="TestRunBasic,TestRunCustomRGAsNull" + compute_image_name_rhel="" + new_line="${test_cases//,/$'\n'}" + echo "************** Going to run ${suite} ${new_line} **************" + common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:-}" "${solution:?}" +} + +# commit based suite on rhel-suite-2 +lsf_da_rhel_suite_2() { + suite=lsf-da-rhel-suite-2 + solution=lsf-da + test_cases="TestRunCustomRGAsNonDefault,TestRunNoKMSAndHTOff" + compute_image_name_rhel="" + new_line="${test_cases//,/$'\n'}" + echo "************** Going to run ${suite} ${new_line} **************" + common_suite "${test_cases}" 
"${suite}" "${compute_image_name_rhel:-}" "${solution:?}" +} + +# commit based suite on rhel-suite-3 +lsf_da_rhel_suite_3() { + suite=lsf-da-rhel-suite-3 + solution=lsf-da + test_cases="TestRunUsingExistingKMS,TestRunUsingExistingKMSInstanceIDAndWithoutKey" + compute_image_name_rhel="" + new_line="${test_cases//,/$'\n'}" + echo "************** Going to run ${suite} ${new_line} **************" + common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:-}" "${solution:?}" +} + +# commit based suite on rhel-suite-4 +lsf_da_rhel_suite_4() { + suite=lsf-da-rhel-suite-4 + solution=lsf-da + test_cases="TestRunWithExistingKMSInstanceAndKeyWithAuthorizationPolicy,TestRunLSFClusterCreationWithZeroWorkerNodes" + compute_image_name_rhel="" + new_line="${test_cases//,/$'\n'}" + echo "************** Going to run ${suite} ${new_line} **************" + common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:-}" "${solution:?}" +} + +# commit based suite on rhel-suite-5 +lsf_da_rhel_suite_5() { + suite=lsf-da-rhel-suite-5 + solution=lsf-da + test_cases="TestRunLDAP,TestRunCosAndVpcFlowLogs" + compute_image_name_rhel="" + new_line="${test_cases//,/$'\n'}" + echo "************** Going to run ${suite} ${new_line} **************" + common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:-}" "${solution:?}" +} + +# commit based suite on rhel-suite-6 +lsf_da_rhel_suite_6() { + suite=lsf-da-rhel-suite-6 + solution=lsf-da + test_cases="TestObservabilityAllFeaturesDisabled,TestObservabilityLogsEnabledForManagementAndCompute" + compute_image_name_rhel="" + new_line="${test_cases//,/$'\n'}" + echo "************** Going to run ${suite} ${new_line} **************" + common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:-}" "${solution:?}" +} + +# commit based suite on rhel-suite-7 +lsf_da_rhel_suite_7() { + suite=lsf-da-rhel-suite-7 + solution=lsf-da + test_cases="TestObservabilityMonitoringEnabledForManagementAndCompute,TestObservabilityAtrackerScenarios" + compute_image_name_rhel="" + new_line="${test_cases//,/$'\n'}" + echo "************** Going to run ${suite} ${new_line} **************" + common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:-}" "${solution:?}" +} + +# commit based suite on rhel-suite-8 +lsf_da_rhel_suite_8() { + suite=lsf-da-rhel-suite-8 + solution=lsf-da + test_cases="TestRunCIDRsAsNonDefault,TestRunMultiProfileStaticAndDynamic" + compute_image_name_rhel="" + new_line="${test_cases//,/$'\n'}" + echo "************** Going to run ${suite} ${new_line} **************" + common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:-}" "${solution:?}" +} diff --git a/DA_LSF_BYOL_Arch.drawio.svg b/DA_LSF_BYOL_Arch.drawio.svg deleted file mode 100644 index 8f2fdb7e..00000000 --- a/DA_LSF_BYOL_Arch.drawio.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
[Text labels from the deleted DA_LSF_BYOL_Arch.drawio.svg architecture diagram: IBM Cloud > Region > Availability Zone; Public Network, Internet, User; Cloud Services: DNS Service, VPC Flow Logs (Optional), ICD MySQL (Optional), COS (Optional), Key Protect (Optional), Secrets Manager (Optional), IBM Cloud Monitoring (Optional), IBM Cloud Logs (Optional), Security and Compliance Center (Optional), Event Notification Service (Optional); VPC "HPC" with SSH, VPN Gateway (optional), VPC API Endpoint, a Login subnet (Floating IP, Public Gateway, Login SG, Bastion Node, Login Node) and an HPC subnet (HPC SG, File Storage, IBM Storage Scale (Optional), LSF Management Nodes v10.1.0.14, LDAP Server, Virtual Server Static Compute Nodes, Virtual Server Dynamic Compute Nodes).]
\ No newline at end of file diff --git a/LSF_DA_New.drawio.svg b/LSF_DA_New.drawio.svg new file mode 100644 index 00000000..0baeb90b --- /dev/null +++ b/LSF_DA_New.drawio.svg @@ -0,0 +1,4 @@
[Text labels from the new LSF_DA_New.drawio.svg architecture diagram: IBM Cloud > Region > Availability Zone; Public Network, Internet, User, SSH, Floating IP, VPN Gateway (optional); VPC "HPC" with VPC API Endpoint, Public Gateway, Subnet Login (Login SG, Bastion node, Login node, Deployer node) and Subnet HPC (HPC SG, File Storage, IBM Storage Scale (Optional), LSF Management Nodes v10.1.0.14/v10.1.0.15, LDAP server, Virtual Server Static Compute nodes, Virtual Server Dynamic Compute nodes); Cloud Services: DNS Services, Key Protect (Optional), Secrets Manager (Optional), IBM Cloud Logs (Optional), COS (Optional), IBM Cloud Monitoring (Optional), Security and Compliance Center (Optional), VPC Flow Logs (Optional).]
\ No newline at end of file diff --git a/README.md b/README.md index 55b6b170..a6dea173 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# IBM Spectrum LSF -Repository for the IBM Spectrum LSF project with IBM Spectrum schedulers +# IBM Cloud HPC +Repository for the IBM Cloud HPC project with IBM Spectrum schedulers ## Deploying the environment using CLI: @@ -19,15 +19,15 @@ Note: IBM Catalog management plug-in must be pre-installed. [Learn more](https:/ ``` $ cp sample/configs/hpc_catalog_values.json values.json $ vim values.json -# Paste your API key and other mandatory parameters value for IBM Spectrum LSF cluster +# Paste your API key and other mandatory parameters value for IBM Cloud HPC cluster # Login to the IBM Cloud CLI $ ibmcloud catalog install --vl --override-values values.json -Note: You can retrieve the by accessing the CLI section within the Deployment options of the IBM Spectrum LSF tile. +Note: You can retrieve the by accessing the CLI section within the Deployment options of the IBM Cloud HPC tile. It bears resemblance to something along these lines: $ ibmcloud catalog install --vl 1082e7d2-5e2f-0a11-a3bc-f88a8e1931fc.c7645085-5f49-4d5f-8786-45ac376e60fe-global --override-values values.json -Attempting install of IBM Spectrum LSF version x.x.x... +Attempting install of IBM Cloud HPC version x.x.x... Schematics workspace: https://cloud.ibm.com/schematics/workspaces/us-south.workspace.globalcatalog-collection.40b1c1e4/jobs?region= Workspace status: DRAFT Workspace status: INACTIVE @@ -46,7 +46,7 @@ You can refer the Schematics workspace url (next to Schematics workspace:) as pa ``` $ cp sample/configs/hpc_schematics_values.json values.json $ vim values.json -# Paste your API key and other mandatory parameters value for IBM Spectrum LSF cluster +# Paste your API key and other mandatory parameters value for IBM Cloud HPC cluster # Login to the IBM Cloud CLI $ ibmcloud schematics workspace new -f values.json --github-token xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx $ ibmcloud schematics workspace list @@ -65,20 +65,15 @@ OK $ ibmcloud schematics logs --id us-east.workspace.hpcc-cluster.7cbc3f6b ... - 2025/02/18 15:00:57 Terraform apply | Apply complete! Resources: 123 added, 0 changed, 0 destroyed. - 2025/02/18 15:00:57 Terraform apply | - 2025/02/18 15:00:57 Terraform apply | Outputs: - 2025/02/18 15:00:57 Terraform apply | - 2025/02/18 15:00:57 Terraform apply | image_entry_found = "true -- - hpc-lsf10-rhel810-v2" - 2025/02/18 15:00:57 Terraform apply | region_name = "us-east" - 2025/02/18 15:00:57 Terraform apply | remote_allowed_cidr = [ - 2025/02/18 15:00:57 Terraform apply | "xxxxxxxx", - 2025/02/18 15:00:57 Terraform apply | ] - 2025/02/18 15:00:57 Terraform apply | ssh_to_login_node = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/xxxhiddenxxx -J ubuntu@xxxxxxxxx lsfadmin@xxxxxxxx" - 2025/02/18 15:00:57 Terraform apply | ssh_to_management_node_1 = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/xxxhiddenxxx -J ubuntu@xxxxxxxxxxx lsfadmin@xxxxxxx" - 2025/02/18 15:00:57 Terraform apply | vpc_name = "hpc-lsf-hpc-vpc -- - r014-7119e3ed-a3b9-4256-9418-f93f94527c1d" - 2025/02/18 15:00:57 Terraform apply | worker_node_min_count = 0 - 2025/02/18 15:00:57 Command finished successfully. + 2023/06/05 22:14:29 Terraform apply | Apply complete! Resources: 41 added, 0 changed, 0 destroyed. 
+ 2023/06/05 22:14:29 Terraform apply | + 2023/06/05 22:14:29 Terraform apply | Outputs: + 2023/06/05 22:14:29 Terraform apply | + 2023/06/05 22:14:29 Terraform apply | image_map_entry_found = "true -- - hpcaas-lsf10-rhel86-v1" + 2023/06/05 22:14:29 Terraform apply | region_name = "us-east" + 2023/06/05 22:14:29 Terraform apply | ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J vpcuser@150.239.215.145 lsfadmin@10.241.0.4" + 2023/06/05 22:14:29 Terraform apply | vpc_name = "dv-hpcaas-vpc -- - r014-e7485f03-6797-4633-b140-2822ce8e1893" + 2023/06/05 22:14:29 Command finished successfully. OK ``` @@ -86,7 +81,7 @@ OK * Connect to an LSF login node through SSH by using the `ssh_to_login_node` command from the Schematics log output. ``` -ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@ lsfadmin@ +ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J vpcuser@ lsfadmin@ ``` * where `floating_IP_address` is the floating IP address for the bastion node and `login_node_IP_address` is the IP address for the login node. @@ -96,7 +91,7 @@ ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@ lsfadmin@ +ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -L 8443:localhost:8443 -L 6080:localhost:6080 -J vpcuser@> lsfadmin@ ``` * where `floating_IP_address` is the floating IP address for the bastion node and `management_node_IP_address` is the IP address for the management node. @@ -104,11 +99,11 @@ ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveIn * To access the Application Center GUI, enter the password you configured when you created your workspace and the default user as "lsfadmin". -* If LDAP is enabled, you can access the LSF Application Center using the LDAP username and password that you configured during IBM Spectrum LSF cluster deployment or using an existing LDAP username and password. +* If LDAP is enabled, you can access the LSF Application Center using the LDAP username and password that you configured during IBM Cloud® HPC cluster deployment or using an existing LDAP username and password. * If IBM Spectrum LSF Application Center GUI is installed in High Availability. The `application_center_tunnel` command is a bit different. Then read also `application_center_url_note` line. ``` -"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -L 8443:pac.:8443 -L 6080:pac.:6080 -J ubuntu@ lsfadmin@" +"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -L 8443:pac.:8443 -L 6080:pac.:6080 -J vpcuser@ lsfadmin@" application_center_url = "https://pac.:8443" ``` @@ -116,7 +111,7 @@ application_center_url = "https://pac.:8443" * Connect to your OpenLDAP server through SSH by using the `ssh_to_ldap_node` command from the Schematics log output. ``` -ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -J ubuntu@ ubuntu@ +ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -J vpcuser@ ubuntu@ ``` * where `floating_IP_address` is the floating IP address for the bastion node and `LDAP_server_IP` is the IP address for the OpenLDAP node. 
@@ -135,7 +130,7 @@ ldapsearch -Q -LLL -Y EXTERNAL -H ldapi:/// * Submit a Job from HPC cluster Management node with LDAP user : Log into the management node using the `ssh_to_management_node` value as shown as part of output section of Schematics job log: ``` -ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@ lsfadmin@ +ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J vpcuser@ lsfadmin@ ``` * where `floating_IP_address` is the floating IP address for the bastion node and `management_node_IP_address` is the IP address for the management node. @@ -150,13 +145,13 @@ Password: * Submit an LSF job as the LDAP user: ``` -[lsfuser05@hpccluster-mgmt-1 lsfadmin]$ bsub -n 8 sleep 100 +[lsfuser05@hpccluster-mgmt-1 lsfadmin]$ bsub -J myjob[1-4] -R "rusage[mem=2G]" sleep 10 Job <1> is submitted to default queue . ``` ### Cleaning up the deployed environment: -If you no longer need your deployed IBM Spectrum LSF cluster, you can clean it up from your environment. The process is threefold: ensure that the cluster is free of running jobs or working compute nodes, destroy all the associated VPC resources and remove them from your IBM Cloud account, and remove the project from the IBM Cloud console. +If you no longer need your deployed IBM Cloud HPC cluster, you can clean it up from your environment. The process is threefold: ensure that the cluster is free of running jobs or working compute nodes, destroy all the associated VPC resources and remove them from your IBM Cloud account, and remove the project from the IBM Cloud console. **Note**: Ensuring that the cluster is free of running jobs and working compute nodes @@ -190,6 +185,12 @@ If the cluster has no running jobs or compute nodes, then it is safe to destroy 3. Confirm the action by entering the workspace name in the text box and click **Destroy**. You can now safely remove the resources from your account. +#### Accessing the HPC Tile in Other Regions +1. If you need to create an HPC cluster outside the us-east, us-south, or eu-de regions, the cluster can still be provisioned in other regions. +2. Instead of using the IBMCloudHPC provider, the automation uses the IBMCloudgen2 provider to spin up the dynamic nodes. +3. Also, instead of using the proxy API URL, the generic VPC API is used to spin up the dynamic nodes in the same account as the user's. +4. When creating an HPC cluster in these other regions, the contract ID and cluster ID are not actually required, so provide any placeholder contract ID and cluster_id. + ## Requirements No requirements. diff --git a/common-dev-assets index 2a2281ec..b76eee7e 160000 --- a/common-dev-assets +++ b/common-dev-assets @@ -1 +1 @@ -Subproject commit 2a2281eca386901262a1d0c7b617dc07476d5944 +Subproject commit b76eee7ec33d5b81b30828e37a02d2df595e0909 diff --git a/cra-config.yaml index cd11f19e..3e617358 100644 --- a/cra-config.yaml +++ b/cra-config.yaml @@ -1,13 +1,10 @@ # More info about this file at https://github.com/terraform-ibm-modules/common-pipeline-assets/blob/main/.github/workflows/terraform-test-pipeline.md#cra-config-yaml version: "v1" CRA_TARGETS: - - CRA_TARGET: "solutions/hpc" + - CRA_TARGET: "solutions/lsf" CRA_IGNORE_RULES_FILE: "cra-tf-validate-ignore-rules.json" PROFILE_ID: "48279384-3d29-4089-8259-8ed354774b4a" # SCC profile ID (currently set to 'CIS IBM Cloud Foundations Benchmark v1.1.0' '1.1.0' profile).
CRA_ENVIRONMENT_VARIABLES: - TF_VAR_cluster_name: "HPC-LSF-1" - TF_VAR_reservation_id: "Contract-IBM-WES-DA" - TF_VAR_bastion_ssh_keys: "[\"geretain-hpc\"]" - TF_VAR_compute_ssh_keys: "[\"geretain-hpc\"]" + TF_VAR_ssh_keys: "[\"geretain-hpc\"]" TF_VAR_remote_allowed_ips: "[\"49.207.216.50\"]" - TF_VAR_solution: "lsf" + TF_VAR_app_center_gui_password: "Craconfig@123" #pragma: allowlist secret diff --git a/datasource.tf b/datasource.tf new file mode 100644 index 00000000..7953cf00 --- /dev/null +++ b/datasource.tf @@ -0,0 +1,101 @@ +# Future use +/* +data "ibm_is_region" "region" { + name = local.region +} + +data "ibm_is_zone" "zone" { + name = var.zones[0] + region = data.ibm_is_region.region.name +} +*/ + +#Fetching Existing VPC CIDR for Security rules: +data "ibm_is_vpc" "existing_vpc" { + count = var.vpc_name != null ? 1 : 0 + name = var.vpc_name +} + +data "ibm_is_vpc_address_prefixes" "existing_vpc_cidr" { + count = var.vpc_name != null ? 1 : 0 + vpc = data.ibm_is_vpc.existing_vpc[0].id +} + +/* +data "ibm_is_subnet" "subnet" { + count = length(local.subnets) + identifier = local.subnets[count.index]["id"] +} +*/ + +# data "ibm_resource_group" "existing_resource_group" { +# count = var.existing_resource_group == null ? 0 : 1 +# name = var.existing_resource_group +# } + +data "ibm_is_subnet" "existing_cluster_subnets" { + count = var.vpc_name != null && var.cluster_subnet_id != null ? 1 : 0 + identifier = var.cluster_subnet_id +} + +data "ibm_is_subnet" "existing_storage_subnets" { + count = var.vpc_name != null && var.storage_subnets != null ? 1 : 0 + name = var.storage_subnets[count.index] +} + +data "ibm_is_subnet" "existing_protocol_subnets" { + count = var.vpc_name != null && var.protocol_subnets != null ? 1 : 0 + name = var.protocol_subnets[count.index] +} + +data "ibm_is_subnet" "existing_client_subnets" { + count = var.vpc_name != null && var.client_subnets != null ? 1 : 0 + name = var.client_subnets[count.index] +} + +data "ibm_is_subnet" "existing_login_subnets" { + count = var.vpc_name != null && var.login_subnet_id != null ? 1 : 0 + identifier = var.login_subnet_id +} + +data "ibm_is_ssh_key" "ssh_keys" { + for_each = toset(var.ssh_keys) + name = each.key +} + +data "ibm_is_subnet" "compute_subnet_crn" { + count = var.vpc_name != null && var.cluster_subnet_id != null ? 1 : 0 + identifier = local.compute_subnet_id +} + +data "ibm_is_instance_profile" "compute_profile" { + name = local.compute_vsi_profile[0] +} + +data "ibm_is_instance_profile" "storage_profile" { + name = local.storage_vsi_profile[0] +} + +data "ibm_is_bare_metal_server_profile" "storage_bms_profile" { + count = var.scheduler == "Scale" ? 1 : 0 + name = local.storage_bms_profile[0] +} + +data "ibm_is_instance_profile" "management_profile" { + name = local.management_vsi_profile[0] +} + +data "ibm_is_instance_profile" "protocol_profile" { + count = local.ces_server_type == false && (local.scale_ces_enabled == true && var.colocate_protocol_instances == false) ? 1 : 0 + name = local.protocol_vsi_profile[0] +} + +data "ibm_is_subnet_reserved_ips" "protocol_subnet_reserved_ips" { + count = local.scale_ces_enabled == true ? 1 : 0 + subnet = local.protocol_subnet_id +} + +data "ibm_is_instance_profile" "afm_server_profile" { + count = local.afm_server_type == false ? 
1 : 0 + name = local.afm_vsi_profile[0] +} diff --git a/examples/create_vpc/solutions/hpc/locals.tf b/examples/create_vpc/locals.tf similarity index 100% rename from examples/create_vpc/solutions/hpc/locals.tf rename to examples/create_vpc/locals.tf diff --git a/examples/create_vpc/solutions/hpc/main.tf b/examples/create_vpc/main.tf similarity index 84% rename from examples/create_vpc/solutions/hpc/main.tf rename to examples/create_vpc/main.tf index 7045346c..a9b77f08 100644 --- a/examples/create_vpc/solutions/hpc/main.tf +++ b/examples/create_vpc/main.tf @@ -1,5 +1,5 @@ module "create_vpc" { - source = "../../modules/landing_zone_vpc" + source = "./modules/landing_zone_vpc" allowed_cidr = var.remote_allowed_ips ibmcloud_api_key = var.ibmcloud_api_key ssh_keys = var.bastion_ssh_keys @@ -10,4 +10,5 @@ module "create_vpc" { bastion_subnets_cidr = var.vpc_cluster_login_private_subnets_cidr_blocks compute_subnets_cidr = var.vpc_cluster_private_subnets_cidr_blocks enable_hub = var.enable_hub + dns_zone_name = var.dns_zone_name } diff --git a/examples/create_vpc/modules/landing_zone_vpc/datasource.tf b/examples/create_vpc/modules/landing_zone_vpc/datasource.tf index 2e2a0e64..32f4c883 100644 --- a/examples/create_vpc/modules/landing_zone_vpc/datasource.tf +++ b/examples/create_vpc/modules/landing_zone_vpc/datasource.tf @@ -1,3 +1,3 @@ -data "ibm_resource_group" "itself" { +data "ibm_resource_group" "existing_resource_group" { name = var.existing_resource_group } diff --git a/examples/create_vpc/modules/landing_zone_vpc/locals.tf b/examples/create_vpc/modules/landing_zone_vpc/locals.tf index bb797af6..7c0d0ac8 100644 --- a/examples/create_vpc/modules/landing_zone_vpc/locals.tf +++ b/examples/create_vpc/modules/landing_zone_vpc/locals.tf @@ -1,6 +1,6 @@ locals { # Defined values - name = "lsf" + name = "hpc" prefix = var.prefix tags = [local.prefix, local.name] schematics_reserved_cidrs = [ @@ -16,12 +16,37 @@ locals { "169.55.82.128/27" ] - bastion_sg_variable_cidr = flatten([ - local.schematics_reserved_cidrs, - var.allowed_cidr - # var.network_cidr - ]) - resource_group_id = var.existing_resource_group != null ? data.ibm_resource_group.itself.id : "" + # Derived values + security_group_name = format("%s-sg", local.prefix) + + # Resource group calculation + # If user defined then use existing else create new + # create_resource_group = var.existing_resource_group == null ? true : false + resource_group_id = var.existing_resource_group != null ? data.ibm_resource_group.existing_resource_group.id : "" + # new_resource_groups = var.existing_resource_group == null ? [ + # { + # name = "service-rg", + # create = local.create_resource_group, + # use_prefix : false + # }, + # { + # name = "management-rg", + # create = local.create_resource_group, + # use_prefix : false + # }, + # { + # name = "workload-rg", + # create = local.create_resource_group, + # use_prefix : false + # } + # ] : [ + # { + # name = var.existing_resource_group, + # create = local.create_resource_group + # } + # ] + # For the variables looking for resource group names only (transit_gateway, key_management, atracker) + # existing_service_resource_group = var.existing_resource_group == null ? 
"service-rg" : var.existing_resource_group # Region and Zone calculations region = join("-", slice(split("-", var.zones[0]), 0, 2)) @@ -30,41 +55,14 @@ locals { for zone in var.zones : format("zone-%d", substr(zone, -1, -2)) ] -} - -locals { - # Subnet calculation - active_subnets = { - for zone in local.zones : zone => contains(local.active_zones, zone) ? [ - { - name = "compute-subnet-${zone}" - acl_name = "vpc-acl" - cidr = var.compute_subnets_cidr[index(local.active_zones, zone)] - public_gateway = true - }, - zone == local.active_zones[0] ? { - name = "bastion-subnet" - acl_name = "vpc-acl" - cidr = var.bastion_subnets_cidr[0] - public_gateway = false - } : null - ] : [] - } - subnets = { for zone, subnets in local.active_subnets : zone => [for each in subnets : each if each != null] } - - # Use public gateway calculation - use_public_gateways = { - for zone in local.zones : zone => contains(local.active_zones, zone) ? true : false - } -} - -locals { - # Address_Prefix calculation bastion_sg_variable_cidr_list = split(",", var.network_cidr) - address_prefixes = { - "zone-${element(split("-", var.zones[0]), 2)}" = [local.bastion_sg_variable_cidr_list[0]] - } + + bastion_sg_variable_cidr = flatten([ + local.schematics_reserved_cidrs, + var.allowed_cidr + # var.network_cidr + ]) # Security group rules bastion_security_group_rules = flatten([ @@ -98,28 +96,57 @@ locals { remote = cidr }] ]) -} -locals { - # # VPC calculation - # # If user defined then use existing else create new - # # Calculate network acl rules (can be done inplace in vpcs) + # Address Prefixes calculation + address_prefixes = { + "zone-${element(split("-", var.zones[0]), 2)}" = [local.bastion_sg_variable_cidr_list[0]] + } + + # Subnet calculation + active_subnets = { + for zone in local.zones : zone => contains(local.active_zones, zone) ? [ + { + name = "compute-subnet-${zone}" + acl_name = "vpc-acl" + cidr = var.compute_subnets_cidr[index(local.active_zones, zone)] + public_gateway = true + }, + zone == local.active_zones[0] ? { + name = "bastion-subnet" + acl_name = "vpc-acl" + cidr = var.bastion_subnets_cidr[0] + public_gateway = true + } : null + ] : [] + } + subnets = { for zone, subnets in local.active_subnets : zone => [for each in subnets : each if each != null] } + + # Use public gateway calculation + use_public_gateways = { + for zone in local.zones : zone => contains(local.active_zones, zone) ? 
true : false + } + + # VPC calculation + # If user defined then use existing else create new + # Calculate network acl rules (can be done inplace in vpcs) + # TODO: VPN expectation + cidrs_network_acl_rules = compact(flatten([local.schematics_reserved_cidrs, var.allowed_cidr, var.network_cidr, "161.26.0.0/16", "166.8.0.0/14", "0.0.0.0/0"])) network_acl_inbound_rules = [ - { - name = "test-1" + for cidr_index in range(length(local.cidrs_network_acl_rules)) : { + name = format("allow-inbound-%s", cidr_index + 1) action = "allow" - destination = "0.0.0.0/0" + destination = var.network_cidr direction = "inbound" - source = "0.0.0.0/0" + source = element(local.cidrs_network_acl_rules, cidr_index) } ] network_acl_outbound_rules = [ - { - name = "test-2" + for cidr_index in range(length(local.cidrs_network_acl_rules)) : { + name = format("allow-outbound-%s", cidr_index + 1) action = "allow" - destination = "0.0.0.0/0" + destination = element(local.cidrs_network_acl_rules, cidr_index) direction = "outbound" - source = "0.0.0.0/0" + source = var.network_cidr } ] network_acl_rules = flatten([local.network_acl_inbound_rules, local.network_acl_outbound_rules]) diff --git a/examples/create_vpc/modules/landing_zone_vpc/main.tf b/examples/create_vpc/modules/landing_zone_vpc/main.tf index cfa62711..e4746c01 100644 --- a/examples/create_vpc/modules/landing_zone_vpc/main.tf +++ b/examples/create_vpc/modules/landing_zone_vpc/main.tf @@ -1,15 +1,17 @@ module "create_vpc" { - source = "terraform-ibm-modules/landing-zone-vpc/ibm" - version = "7.19.0" - prefix = local.prefix - region = local.region - tags = local.tags - resource_group_id = local.resource_group_id - name = local.name - use_public_gateways = local.use_public_gateways - subnets = local.subnets - address_prefixes = local.address_prefixes - security_group_rules = local.bastion_security_group_rules - network_acls = local.network_acls - enable_hub = var.enable_hub + source = "terraform-ibm-modules/landing-zone-vpc/ibm" + version = "7.23.6" + prefix = local.prefix + region = local.region + tags = local.tags + resource_group_id = local.resource_group_id + name = local.name + use_public_gateways = local.use_public_gateways + subnets = local.subnets + address_prefixes = local.address_prefixes + security_group_rules = local.bastion_security_group_rules + network_acls = local.network_acls + enable_hub = var.enable_hub + default_security_group_name = local.security_group_name + dns_zone_name = var.dns_zone_name } diff --git a/examples/create_vpc/modules/landing_zone_vpc/variables.tf b/examples/create_vpc/modules/landing_zone_vpc/variables.tf index 954feac4..f3098023 100644 --- a/examples/create_vpc/modules/landing_zone_vpc/variables.tf +++ b/examples/create_vpc/modules/landing_zone_vpc/variables.tf @@ -65,6 +65,12 @@ variable "enable_hub" { default = false } +variable "dns_zone_name" { + description = "The name of the DNS zone to be created." + default = null + type = string +} + variable "allowed_cidr" { description = "Network CIDR to access the VPC. This is used to manage network ACL rules for accessing the cluster." 
type = list(string) diff --git a/examples/create_vpc/solutions/hpc/outputs.tf b/examples/create_vpc/outputs.tf similarity index 100% rename from examples/create_vpc/solutions/hpc/outputs.tf rename to examples/create_vpc/outputs.tf diff --git a/examples/create_vpc/solutions/hpc/variables.tf b/examples/create_vpc/variables.tf similarity index 98% rename from examples/create_vpc/solutions/hpc/variables.tf rename to examples/create_vpc/variables.tf index be6e257f..32ed67dc 100644 --- a/examples/create_vpc/solutions/hpc/variables.tf +++ b/examples/create_vpc/variables.tf @@ -124,3 +124,9 @@ variable "enable_hub" { type = bool default = false } + +variable "dns_zone_name" { + description = "The name of the DNS zone to be created." + default = null + type = string +} diff --git a/examples/create_vpc/solutions/hpc/version.tf b/examples/create_vpc/version.tf similarity index 100% rename from examples/create_vpc/solutions/hpc/version.tf rename to examples/create_vpc/version.tf diff --git a/ibm_catalog.json b/ibm_catalog.json index e9d19d11..4c101210 100644 --- a/ibm_catalog.json +++ b/ibm_catalog.json @@ -43,10 +43,10 @@ ], "flavors": [ { - "label": "Cluster with LSF v10.1.0.14", + "label": "Cluster with LSF v10.1.0.15", "name": "Cluster-with-LSF", "install_type": "fullstack", - "working_directory": "solutions/hpc", + "working_directory": "solutions/lsf", "compliance": { "authority": "scc-v3", "profiles": [ @@ -66,16 +66,26 @@ "required": true }, { - "key": "cluster_name" - }, - { - "key": "bastion_ssh_keys" + "key": "ssh_keys", + "required": true }, { - "key": "compute_ssh_keys" + "key": "remote_allowed_ips" }, { - "key": "remote_allowed_ips" + "key": "lsf_version", + "default_value": "fixpack_15", + "required": true, + "options": [ + { + "displayname": "fixpack_15", + "value": "fixpack_15" + }, + { + "displayname": "fixpack_14", + "value": "fixpack_14" + } + ] }, { "key": "zones", @@ -192,6 +202,10 @@ } ] }, + { + "key": "app_center_gui_password", + "required": true + }, { "key": "cluster_prefix" }, @@ -277,60 +291,77 @@ "key": "skip_flowlogs_s2s_auth_policy" }, { - "key": "scc_enable" + "key": "skip_kms_s2s_auth_policy" }, { - "key": "scc_profile", - "default_value": "CIS IBM Cloud Foundations Benchmark v1.1.0", + "key": "sccwp_enable" + }, + { + "key": "sccwp_service_plan", + "default_value": "free-trial", "options": [ { - "displayname": "CIS IBM Cloud Foundations Benchmark v1.1.0", - "value": "CIS IBM Cloud Foundations Benchmark v1.1.0" + "displayname": "free-trial", + "value": "free-trial" }, { - "displayname": "IBM Cloud Framework for Financial Services", - "value": "IBM Cloud Framework for Financial Services" + "displayname": "graduated-tier", + "value": "graduated-tier" } ] }, { - "key": "scc_location", - "default_value": "us-south", + "key": "cspm_enabled" + }, + { + "key": "app_config_plan", + "default_value": "basic", "options": [ { - "displayname": "us-south", - "value": "us-south" + "displayname": "basic", + "value": "basic" }, { - "displayname": "eu-de", - "value": "eu-de" + "displayname": "lite", + "value": "lite" }, { - "displayname": "ca-tor", - "value": "ca-tor" + "displayname": "standardv2", + "value": "standardv2" }, { - "displayname": "eu-es", - "value": "eu-es" + "displayname": "enterprise", + "value": "enterprise" } ] }, { - "key": "scc_event_notification_plan", - "default_value": "lite", - "options": [ - { - "displayname": "lite", - "value": "lite" - }, - { - "displayname": "standard", - "value": "standard" - } - ] + "key": "bastion_instance", + "type": "object", + 
"default_value": "{\n \"image\": \"ibm-ubuntu-22-04-5-minimal-amd64-3\",\n \"profile\": \"cx2-4x8\"\n}", + "required": false, + "custom_config": { + "type": "json_editor", + "grouping": "deployment", + "original_grouping": "deployment", + "config_constraints": { + "type": "mixed" + } + } }, { - "key": "cos_expiration_days" + "key": "deployer_instance", + "type": "object", + "default_value": "{\n \"image\": \"hpc-lsf-fp15-deployer-rhel810-v1\",\n \"profile\": \"bx2-8x32\"\n}", + "required": false, + "custom_config": { + "type": "json_editor", + "grouping": "deployment", + "original_grouping": "deployment", + "config_constraints": { + "type": "mixed" + } + } }, { "key": "vpc_cidr" @@ -345,36 +376,57 @@ "key": "vpc_name" }, { - "key": "cluster_subnet_ids" + "key": "cluster_subnet_id" }, { "key": "login_subnet_id" }, { - "key": "login_node_instance_type" - }, - { - "key": "management_node_instance_type" - }, - { - "key": "management_node_count" - }, - { - "key": "management_image_name" - }, - { - "key": "compute_image_name" + "key": "login_instance", + "type": "array", + "default_value": "[\n {\n \"profile\": \"bx2-2x8\",\n \"image\": \"hpc-lsf-fp15-compute-rhel810-v1\"\n }\n]", + "required": false, + "custom_config": { + "type": "json_editor", + "grouping": "deployment", + "original_grouping": "deployment", + "config_constraints": { + "type": "mixed" + } + } }, { - "key": "login_image_name" + "key": "management_instances", + "type": "array", + "default_value": "[\n {\n \"count\": 2,\n \"profile\": \"bx2-16x64\",\n \"image\": \"hpc-lsf-fp15-rhel810-v1\"\n }\n]", + "required": false, + "custom_config": { + "type": "json_editor", + "grouping": "deployment", + "original_grouping": "deployment", + "config_constraints": { + "type": "mixed" + } + } }, { - "key": "worker_node_max_count" + "key": "static_compute_instances", + "type": "array", + "default_value": "[\n {\n \"count\": 0,\n \"profile\": \"bx2-16x64\",\n \"image\": \"hpc-lsf-fp15-compute-rhel810-v1\"\n }\n]", + "required": false, + "custom_config": { + "type": "json_editor", + "grouping": "deployment", + "original_grouping": "deployment", + "config_constraints": { + "type": "mixed" + } + } }, { - "key": "worker_node_instance_type", + "key": "dynamic_compute_instances", "type": "array", - "default_value": "[\n {\n \"count\": 0,\n \"instance_type\": \"bx2-4x16\"\n },\n {\n \"count\": 0,\n \"instance_type\": \"cx2-8x16\"\n }\n]", + "default_value": "[\n {\n \"count\": 500,\n \"profile\": \"bx2-16x64\",\n \"image\": \"hpc-lsf-fp15-compute-rhel810-v1\"\n }\n]", "required": false, "custom_config": { "type": "json_editor", @@ -389,19 +441,7 @@ "key": "enable_dedicated_host" }, { - "key": "hyperthreading_enabled" - }, - { - "key": "enable_app_center" - }, - { - "key": "app_center_gui_pwd" - }, - { - "key": "app_center_high_availability" - }, - { - "key": "app_center_existing_certificate_instance" + "key": "enable_hyperthreading" }, { "key": "custom_file_shares", @@ -472,17 +512,22 @@ "key": "ldap_user_password" }, { - "key": "ldap_vsi_profile" - }, - { - "key": "ldap_vsi_osimage_name" + "key": "ldap_instance", + "type": "array", + "default_value": "[\n {\n \"profile\": \"cx2-2x4\",\n \"image\": \"ibm-ubuntu-22-04-5-minimal-amd64-3\"\n }\n]", + "required": false, + "custom_config": { + "type": "json_editor", + "grouping": "deployment", + "original_grouping": "deployment", + "config_constraints": { + "type": "mixed" + } + } }, { "key": "ldap_server_cert" }, - { - "key": "enable_fip" - }, { "key": "existing_bastion_instance_name" }, @@ -504,23 +549,19 
@@ }, { "hidden": true, - "key": "solution" - }, - { - "hidden": true, - "key": "reservation_id" + "key": "TF_VERSION" }, { "hidden": true, - "key": "TF_VERSION" + "key": "TF_PARALLELISM" }, { "hidden": true, - "key": "TF_PARALLELISM" + "key": "override" }, { "hidden": true, - "key": "TF_VALIDATION_SCRIPT_FILES" + "key": "override_json_string" } ], "iam_permissions": [ @@ -546,9 +587,8 @@ ] }, { - "service_name": "compliance", + "service_name": "sysdig-secure", "role_crns": [ - "crn:v1:bluemix:public:iam::::serviceRole:Manager", "crn:v1:bluemix:public:iam::::role:Administrator" ] }, @@ -565,6 +605,13 @@ "crn:v1:bluemix:public:iam::::role:Editor" ] }, + { + "service_name": "service", + "role_crns": [ + "crn:v1:bluemix:public:iam::::serviceRole:Manager", + "crn:v1:bluemix:public:iam::::role:Administrator" + ] + }, { "service_name": "iam-identity", "role_crns": [ @@ -572,9 +619,10 @@ ] }, { - "service_name": "databases-for-mysql", + "service_name": "apprapp", "role_crns": [ - "crn:v1:bluemix:public:iam::::role:Editor" + "crn:v1:bluemix:public:iam::::serviceRole:Manager", + "crn:v1:bluemix:public:iam::::role:Administrator" ] }, { @@ -598,6 +646,7 @@ } ], "architecture": { + "descriptions": "", "features": [ { "title": "Separate VPC for HPC workloads", @@ -636,7 +685,7 @@ { "diagram": { "caption": "IBM Spectrum LSF", - "url": "https://raw.githubusercontent.com/terraform-ibm-modules/terraform-ibm-hpc/main/DA_LSF_BYOL_Arch.drawio.svg", + "url": "https://raw.githubusercontent.com/terraform-ibm-modules/terraform-ibm-hpc/main/LSF_DA_New.drawio.svg", "type": "image/svg+xml" }, "description": "This deployable architecture sets up a VPC on IBM Cloud to run HPC workloads within a single zone. A login node is deployed in a dedicated subnet and security group to facilitate secure access to the HPC environment. The HPC management nodes and static compute nodes reside in a separate subnet and security group.\nClusters of virtual server instances are provisioned with the IBM Spectrum LSF scheduler pre-installed for HPC workload job management. The LSF scheduler dynamically provisions compute nodes as needed and removes them once jobs are completed.\nThe solution supports either IBM Cloud File Storage for VPC or a dedicated clustered shared file system using IBM Storage Scale which is a high performance, highly available, clustered file system with advanced features like File Audit Logging for security and Active File Management for hybrid cloud connectivity. IBM Storage Scale provides more performance and scalability than standard file storage solutions." diff --git a/locals.tf b/locals.tf new file mode 100644 index 00000000..ce3bc585 --- /dev/null +++ b/locals.tf @@ -0,0 +1,581 @@ +# locals needed for landing_zone +locals { + # Region and Zone calculations + region = join("-", slice(split("-", var.zones[0]), 0, 2)) + + # SSH key calculations + # Combining the common ssh keys with host specific ssh keys + gklm_instance_key_pair = distinct(concat(coalesce(var.gklm_instance_key_pair, []), coalesce(var.ssh_keys, []))) + ldap_instance_key_pair = distinct(concat(coalesce(var.ldap_instance_key_pair, []), coalesce(var.ssh_keys, []))) + ssh_keys = distinct(coalesce(var.ssh_keys, [])) + key_management = var.key_management == "null" ? null : var.key_management + ldap_server = var.ldap_server == null ? "null" : var.ldap_server + ldap_admin_password = var.ldap_admin_password == null ? "" : var.ldap_admin_password + ldap_server_cert = var.ldap_server_cert == null ? 
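In the new locals.tf, the region is derived by trimming the zone suffix from the first entry of var.zones, and each host-specific SSH key list is unioned with the common keys. A minimal sketch of the same expressions with made-up inputs:

locals {
  example_zones  = ["us-east-1"]                                              # hypothetical input
  example_region = join("-", slice(split("-", local.example_zones[0]), 0, 2)) # -> "us-east"

  example_common_keys = ["cluster-key"]              # hypothetical input
  example_gklm_keys   = ["gklm-key", "cluster-key"]  # hypothetical input
  # host-specific keys first, common keys appended, duplicates removed
  example_gklm_key_pair = distinct(concat(coalesce(local.example_gklm_keys, []), coalesce(local.example_common_keys, [])))
  # -> ["gklm-key", "cluster-key"]
}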
"null" : var.ldap_server_cert +} + +# locals needed for deployer +locals { + # dependency: landing_zone -> deployer + vpc_id = var.vpc_name == null ? one(module.landing_zone.vpc_id) : data.ibm_is_vpc.existing_vpc[0].id + vpc_name = var.vpc_name == null ? one(module.landing_zone.vpc_name) : var.vpc_name + kms_encryption_enabled = local.key_management != null ? true : false + boot_volume_encryption_key = local.key_management != null && var.enable_deployer ? one(module.landing_zone.boot_volume_encryption_key)["crn"] : null + existing_kms_instance_guid = local.key_management != null ? module.landing_zone.key_management_guid : null + cos_data = module.landing_zone.cos_buckets_data + # Future use + # When we implement the existing bastion concept we need the changes to implemented like below. Which is already there on our LSF DA + # skip_iam_authorization_policy = true + # skip_iam_authorization_policy = var.bastion_instance_name != null ? false : local.skip_iam_authorization_policy + # Cluster node details: + compute_instances = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].management_vsi_data, module.landing_zone_vsi[0].compute_vsi_data]) + comp_mgmt_instances = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].compute_management_vsi_data]) + storage_instances = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].storage_vsi_data]) + storage_servers = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].storage_bms_data]) + protocol_instances = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].protocol_vsi_data]) + gklm_instances = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].gklm_vsi_data]) + client_instances = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].client_vsi_data]) + afm_instances = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].afm_vsi_data]) + ldap_instances = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].ldap_vsi_data]) + tie_brkr_instances = var.enable_deployer ? [] : flatten(module.landing_zone_vsi[0].storage_cluster_tie_breaker_vsi_data) + strg_mgmt_instances = var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].storage_cluster_management_vsi]) + login_instance = var.enable_deployer ? [] : flatten(module.landing_zone_vsi[0].login_vsi_data) + + management_instance_count = sum(var.management_instances[*]["count"]) + storage_instance_count = var.storage_type == "persistent" ? sum(var.storage_servers[*]["count"]) : sum(var.storage_instances[*]["count"]) + client_instance_count = sum(var.client_instances[*]["count"]) + protocol_instance_count = sum(var.protocol_instances[*]["count"]) + static_compute_instance_count = sum(var.static_compute_instances[*]["count"]) + # afm_instance_count = sum(var.afm_instances[*]["count"]) +} + +# locals needed for landing_zone_vsi +locals { + # dependency: landing_zone -> deployer -> landing_zone_vsi + bastion_security_group_id = module.deployer.bastion_security_group_id + bastion_public_key_content = module.deployer.bastion_public_key_content + bastion_private_key_content = module.deployer.bastion_private_key_content + + deployer_hostname = var.enable_deployer ? 
flatten(module.deployer.deployer_vsi_data[*].list)[0].name : "" + deployer_ip = module.deployer.deployer_ip + + # Existing subnets details + existing_cluster_subnets = [ + for subnet in data.ibm_is_subnet.existing_cluster_subnets : + { + cidr = subnet.ipv4_cidr_block + id = subnet.id + name = subnet.name + zone = subnet.zone + } + ] + + existing_storage_subnets = [ + for subnet in data.ibm_is_subnet.existing_storage_subnets : + { + cidr = subnet.ipv4_cidr_block + id = subnet.id + name = subnet.name + zone = subnet.zone + } + ] + + existing_protocol_subnets = [ + for subnet in data.ibm_is_subnet.existing_protocol_subnets : + { + cidr = subnet.ipv4_cidr_block + id = subnet.id + name = subnet.name + zone = subnet.zone + } + ] + + existing_client_subnets = [ + for subnet in data.ibm_is_subnet.existing_client_subnets : + { + cidr = subnet.ipv4_cidr_block + id = subnet.id + name = subnet.name + zone = subnet.zone + } + ] + + existing_login_subnets = [ + for subnet in data.ibm_is_subnet.existing_login_subnets : + { + cidr = subnet.ipv4_cidr_block + id = subnet.id + name = subnet.name + zone = subnet.zone + } + ] + + # dependency: landing_zone -> landing_zone_vsi + client_subnets = var.vpc_name != null && var.client_subnets != null ? local.existing_client_subnets : module.landing_zone.client_subnets + cluster_subnets = var.vpc_name != null && var.cluster_subnet_id != null ? local.existing_cluster_subnets : module.landing_zone.compute_subnets + storage_subnets = var.vpc_name != null && var.storage_subnets != null ? local.existing_storage_subnets : module.landing_zone.storage_subnets + protocol_subnets = var.vpc_name != null && var.protocol_subnets != null ? local.existing_protocol_subnets : module.landing_zone.protocol_subnets + login_subnets = var.vpc_name != null && var.login_subnet_id != null ? local.existing_login_subnets : module.landing_zone.bastion_subnets + + storage_subnet = [for subnet in local.storage_subnets : subnet.name] + protocol_subnet = [for subnet in local.protocol_subnets : subnet.name] + protocol_subnet_id = local.protocol_instance_count > 0 ? [for subnet in local.protocol_subnets : subnet.id][0] : "" + cluster_subnet = [for subnet in local.cluster_subnets : subnet.id][0] + client_subnet = [for subnet in local.client_subnets : subnet.name] + login_subnet = [for subnet in local.login_subnets : subnet.id][0] + + #boot_volume_encryption_key = local.key_management != null ? one(module.landing_zone.boot_volume_encryption_key)["crn"] : null + #skip_iam_authorization_policy = true +} + +# locals needed for file-storage +locals { + # dependency: landing_zone_vsi -> file-share + compute_subnet_id = (var.vpc_name == null && var.cluster_subnet_id == null ? local.cluster_subnets[0].id : (var.vpc_name != null && var.cluster_subnet_id != null ? [for subnet in data.ibm_is_subnet.existing_cluster_subnets : subnet.id][0] : (var.vpc_name != null && var.cluster_subnet_id == null ? local.cluster_subnets[0].id : ""))) + bastion_subnet_id = (var.enable_deployer && var.vpc_name != null && var.login_subnet_id != null) ? local.existing_login_subnets[0].id : "" + subnet_id = (var.enable_deployer && var.vpc_name != null && var.cluster_subnet_id != null) ? local.existing_cluster_subnets[0].id : "" + compute_security_group_id = var.enable_deployer ? 
[] : module.landing_zone_vsi[0].compute_sg_id + + nfs_shares_map = { + for share in var.custom_file_shares : + share.mount_path => share.nfs_share + if share.nfs_share != "" && share.nfs_share != null + } + + fileset_size_map = try({ for details in var.custom_file_shares : details.mount_path => details.size }, {}) + + # Original file share map from module + original_map = var.enable_deployer ? {} : module.file_storage[0].name_mount_path_map + + # Extract keyword-to-target mapping from file share names + keyword_to_target_map = var.enable_deployer ? {} : { + for k, v in local.original_map : + split("-", k)[length(split("-", k)) - 4] => v + } + + # Build base map from custom_file_shares (excluding any with `nfs_share`) + base_fileshare_map = var.enable_deployer ? {} : { + for share in var.custom_file_shares : + share.mount_path => lookup(local.keyword_to_target_map, regex("[^/]+$", share.mount_path), null) + if( + share.nfs_share == null && + contains(keys(local.keyword_to_target_map), regex("[^/]+$", share.mount_path)) + ) + } + + # Check if "lsf" is present in the keyword map (i.e., provisioned by Terraform) + lsf_exists = contains(keys(local.keyword_to_target_map), "lsf") + + # Check if "lsf" is explicitly provided in custom_file_shares (any type) + lsf_in_shares = length([ + for share in var.custom_file_shares : + share if regex("[^/]+$", share.mount_path) == "lsf" + ]) > 0 + + # Final VPC fileshare map with /mnt/lsf auto-added only if it's not already in custom_file_shares AND was provisioned by Terraform + fileshare_name_mount_path_map = var.enable_deployer ? {} : merge( + local.base_fileshare_map, + ( + local.lsf_exists && !local.lsf_in_shares ? + { "/mnt/lsf" = local.keyword_to_target_map["lsf"] } : + {} + ) + ) + + valid_lsf_shares = [ + for share in var.custom_file_shares : + { + mount_path = "/mnt/lsf" + nfs_share = share.nfs_share + } + if share.mount_path == "/mnt/lsf" && share.nfs_share != "" && share.nfs_share != null + ] + + valid_default_vpc_share = [ + for share in var.custom_file_shares : + { + mount_path = "/mnt/lsf" + size = share.size + iops = share.size + } + if share.mount_path == "/mnt/lsf" && share.size != null && share.iops != null + ] + default_share = local.management_instance_count > 0 && length(local.valid_lsf_shares) == 0 && length(local.valid_default_vpc_share) == 0 ? [ + { + mount_path = "/mnt/lsf" + size = 100 + iops = 1000 + } + ] : [] + + vpc_file_share = [ + for share in var.custom_file_shares : + { + mount_path = share.mount_path + size = share.size + iops = share.iops + } + if share.size != null && share.iops != null && share.mount_path != "/mnt/lsf" + ] + + total_shares = concat(length(local.valid_default_vpc_share) == 1 ? local.valid_default_vpc_share : local.default_share, local.vpc_file_share) + file_shares = [ + for count in range(length(local.total_shares)) : + { + name = format("%s-%s", var.cluster_prefix, element(split("/", local.total_shares[count]["mount_path"]), length(split("/", local.total_shares[count]["mount_path"])) - 1)) + size = local.total_shares[count]["size"] + iops = local.total_shares[count]["iops"] + } + ] +} + +# locals needed for DNS +locals { + # dependency: landing_zone -> DNS + # resource_group = var.existing_resource_group == null ? "workload-rg" : var.existing_resource_group + resource_group_ids = { + # management_rg = var.existing_resource_group == null ? module.landing_zone.resource_group_id[0]["management-rg"] : one(values(one(module.landing_zone.resource_group_id))) + service_rg = var.enable_deployer ? 
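The file-share locals guarantee a /mnt/lsf share: if management nodes are requested and the user supplied neither an NFS mount nor a sized VPC share for /mnt/lsf, a default share (size 100, 1000 IOPS) is added, and every VPC share is then named by prefixing its last path segment with the cluster prefix. The naming expression with hypothetical values:

locals {
  example_prefix     = "lsf-demo"      # hypothetical cluster_prefix
  example_mount_path = "/mnt/binaries" # hypothetical custom_file_shares entry
  # last path segment prefixed with the cluster prefix -> "lsf-demo-binaries"
  example_share_name = format("%s-%s", local.example_prefix,
    element(split("/", local.example_mount_path), length(split("/", local.example_mount_path)) - 1))
}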
(var.existing_resource_group == "null" ? module.landing_zone.resource_group_id[0]["${var.cluster_prefix}-service-rg"] : one(values(one(module.landing_zone.resource_group_id)))) : "" + workload_rg = var.enable_deployer ? (var.existing_resource_group == "null" ? module.landing_zone.resource_group_id[0]["${var.cluster_prefix}-workload-rg"] : one(values(one(module.landing_zone.resource_group_id)))) : "" + } + # resource_group_id = one(values(one(module.landing_zone.resource_group_id))) + vpc_crn = var.vpc_name == null ? one(module.landing_zone.vpc_crn) : one(data.ibm_is_vpc.existing_vpc[*].crn) + # TODO: Fix existing subnet logic + #subnets_crn = var.vpc_name == null ? module.landing_zone.subnets_crn : ### + existing_compute_subnet_crns = [for subnet in data.ibm_is_subnet.existing_cluster_subnets : subnet.crn] + existing_storage_subnet_crns = [for subnet in data.ibm_is_subnet.existing_storage_subnets : subnet.crn] + existing_protocol_subnet_crns = [for subnet in data.ibm_is_subnet.existing_protocol_subnets : subnet.crn] + existing_client_subnet_crns = [for subnet in data.ibm_is_subnet.existing_client_subnets : subnet.crn] + existing_bastion_subnet_crns = [for subnet in data.ibm_is_subnet.existing_login_subnets : subnet.crn] + subnets_crn = concat(local.existing_compute_subnet_crns, local.existing_storage_subnet_crns, local.existing_protocol_subnet_crns, local.existing_client_subnet_crns, local.existing_bastion_subnet_crns) + # subnets_crn = var.vpc_name == null && var.cluster_subnet_id == null ? module.landing_zone.subnets_crn : concat(local.existing_subnet_crns, module.landing_zone.subnets_crn) + # subnets = flatten([local.cluster_subnets, local.storage_subnets, local.protocol_subnets]) + # subnets_crns = data.ibm_is_subnet.itself[*].crn + # subnets_crn = module.landing_zone.subnets_crn + # boot_volume_encryption_key = local.key_management != null ? one(module.landing_zone.boot_volume_encryption_key)["crn"] : null + + # dependency: landing_zone_vsi -> file-share +} + +# locals needed for dns-records +locals { + # dependency: dns -> dns-records + dns_instance_id = var.enable_deployer ? "" : module.dns[0].dns_instance_id + # dns_custom_resolver_id = var.enable_deployer ? "" : module.dns[0].dns_custom_resolver_id + dns_zone_map_list = var.enable_deployer ? 
[] : module.dns[0].dns_zone_maps + compute_dns_zone_id = one(flatten([ + for dns_zone in local.dns_zone_map_list : values(dns_zone) if one(keys(dns_zone)) == var.dns_domain_names["compute"] + ])) + storage_dns_zone_id = one(flatten([ + for dns_zone in local.dns_zone_map_list : values(dns_zone) if one(keys(dns_zone)) == var.dns_domain_names["storage"] + ])) + protocol_dns_zone_id = one(flatten([ + for dns_zone in local.dns_zone_map_list : values(dns_zone) if one(keys(dns_zone)) == var.dns_domain_names["protocol"] + ])) + client_dns_zone_id = one(flatten([ + for dns_zone in local.dns_zone_map_list : values(dns_zone) if one(keys(dns_zone)) == var.dns_domain_names["client"] + ])) + gklm_dns_zone_id = one(flatten([ + for dns_zone in local.dns_zone_map_list : values(dns_zone) if one(keys(dns_zone)) == var.dns_domain_names["gklm"] + ])) + + # dependency: landing_zone_vsi -> dns-records + deployer_instances = [ + { + name = var.deployer_hostname + ipv4_address = var.deployer_ip + } + ] + + compute_dns_records = [ + for instance in concat(local.compute_instances, local.comp_mgmt_instances, local.deployer_instances, local.login_instance) : + { + name = instance["name"] + rdata = instance["ipv4_address"] + } + ] + storage_dns_records = [ + for instance in concat(local.storage_instances, local.protocol_instances, local.afm_instances, local.tie_brkr_instances, local.strg_mgmt_instances, local.storage_servers) : + { + name = instance["name"] + rdata = instance["ipv4_address"] + } + ] + client_dns_records = [ + for instance in local.client_instances : + { + name = instance["name"] + rdata = instance["ipv4_address"] + } + ] + gklm_dns_records = [ + for instance in local.gklm_instances : + { + name = instance["name"] + rdata = instance["ipv4_address"] + } + ] +} + +# locals needed for inventory +locals { + compute_hosts = try([for name in local.compute_instances[*]["name"] : "${name}.${var.dns_domain_names["compute"]}"], []) + # storage_hosts = try([for name in local.storage_instances[*]["name"] : "${name}.${var.dns_domain_names["storage"]}"], []) + ldap_hosts = try([for instance in local.ldap_instances : instance["ipv4_address"]], []) + login_host_ip = try([for instance in local.login_instance : instance["ipv4_address"]], []) + compute_inventory_path = var.enable_deployer ? "${path.root}/../../modules/ansible-roles/compute.ini" : "${path.root}/modules/ansible-roles/compute.ini" + compute_hosts_inventory_path = var.enable_deployer ? "${path.root}/../../solutions/lsf/compute_hosts.ini" : "${path.root}/solutions/lsf/compute_hosts.ini" + mgmt_hosts_inventory_path = var.enable_deployer ? "${path.root}/../../solutions/lsf/mgmt_hosts.ini" : "${path.root}/solutions/lsf/mgmt_hosts.ini" + bastion_hosts_inventory_path = var.enable_deployer ? "${path.root}/../../solutions/lsf/bastion_hosts.ini" : "${path.root}/solutions/lsf/bastion_hosts.ini" + deployer_hosts_inventory_path = var.enable_deployer ? "${path.root}/../../solutions/lsf/deployer_hosts.ini" : "${path.root}/solutions/lsf/deployer_hosts.ini" + ldap_hosts_inventory_path = var.enable_deployer ? "${path.root}/../../solutions/lsf/ldap_hosts.ini" : "${path.root}/solutions/lsf/ldap_hosts.ini" + login_host_inventory_path = var.enable_deployer ? "${path.root}/../../solutions/lsf/login_host.ini" : "${path.root}/solutions/lsf/login_host.ini" + # storage_inventory_path = var.enable_deployer ? 
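The DNS-record locals pick each zone ID out of the list of single-entry maps returned by the DNS module by matching the map's only key against the configured domain name. The same one(flatten([...])) pattern with hypothetical data:

locals {
  example_zone_maps = [ # hypothetical module output
    { "comp.example.com" = "zone-111" },
    { "strg.example.com" = "zone-222" },
  ]
  example_compute_zone_id = one(flatten([
    for dns_zone in local.example_zone_maps : values(dns_zone)
    if one(keys(dns_zone)) == "comp.example.com"
  ]))
  # -> "zone-111"
}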
"${path.root}/../../modules/ansible-roles/storage.ini" : "${path.root}/modules/ansible-roles/storage.ini" +} + +# locals needed for playbook +locals { + bastion_fip = module.deployer.bastion_fip + compute_private_key_path = var.enable_deployer ? "${path.root}/../../modules/ansible-roles/compute_id_rsa" : "${path.root}/modules/ansible-roles/compute_id_rsa" #checkov:skip=CKV_SECRET_6 + # storage_private_key_path = var.enable_deployer ? "${path.root}/../../modules/ansible-roles/storage_id_rsa" : "${path.root}/modules/ansible-roles/storage_id_rsa" #checkov:skip=CKV_SECRET_6 + observability_playbook_path = var.enable_deployer ? "${path.root}/../../modules/ansible-roles/observability.yaml" : "${path.root}/modules/ansible-roles/observability.yaml" + lsf_mgmt_playbooks_path = var.enable_deployer ? "${path.root}/../../modules/ansible-roles/lsf_mgmt_config.yml" : "${path.root}/modules/ansible-roles/lsf_mgmt_config.yml" + playbooks_path = var.enable_deployer ? "${path.root}/../../modules/ansible-roles/" : "${path.root}/modules/ansible-roles" + # storage_playbook_path = var.enable_deployer ? "${path.root}/../../modules/ansible-roles/storage_ssh.yaml" : "${path.root}/modules/ansible-roles/storage_ssh.yaml" +} + +# file Share OutPut +locals { + cloud_logs_ingress_private_endpoint = var.enable_deployer ? "" : module.cloud_monitoring_instance_creation[0].cloud_logs_ingress_private_endpoint +} + +# details needed for json file +locals { + compute_instances_data = var.scheduler == "LSF" ? var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].compute_vsi_data]) : [] + compute_hosts_ips = var.scheduler == "LSF" ? var.enable_deployer ? [] : local.compute_instances_data[*]["ipv4_address"] : [] + # bastion_instances_data = var.scheduler == "LSF" ? var.enable_deployer ? flatten([module.deployer.bastion_vsi_data]) : [] : [] + bastion_hosts_ips = var.scheduler == "LSF" ? var.enable_deployer ? [module.deployer.bastion_fip] : [] : [] + deployer_hosts_ips = var.scheduler == "LSF" ? var.enable_deployer ? [module.deployer.deployer_ip] : [] : [] + mgmt_instances_data = var.scheduler == "LSF" ? var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].management_vsi_data]) : [] + mgmt_hosts_ips = var.scheduler == "LSF" ? var.enable_deployer ? [] : local.mgmt_instances_data[*]["ipv4_address"] : [] + ldap_hosts_ips = var.scheduler == "LSF" ? var.enable_deployer ? [] : (var.enable_ldap == true ? (var.ldap_server == "null" ? local.ldap_instances[*]["ipv4_address"] : [var.ldap_server]) : []) : [] + json_inventory_path = var.enable_deployer ? "${path.root}/../../modules/ansible-roles/all.json" : "${path.root}/modules/ansible-roles/all.json" + management_nodes = var.scheduler == "LSF" ? var.enable_deployer ? [] : (flatten([module.landing_zone_vsi[0].management_vsi_data]))[*]["name"] : [] + login_host = var.scheduler == "LSF" ? var.enable_deployer ? [] : (flatten([module.landing_zone_vsi[0].login_vsi_data]))[*]["name"] : [] + compute_nodes = var.scheduler == "LSF" ? ( + var.enable_deployer ? [] : flatten([module.landing_zone_vsi[0].compute_vsi_data])[*]["name"] + ) : [] + + compute_nodes_list = var.scheduler == "LSF" ? ( + var.enable_deployer ? [] : ( + length(local.compute_nodes) == 0 ? [] : distinct(flatten([ + for prefix, nodes in { + for node in local.compute_nodes : + join("-", slice(split("-", node), 0, length(split("-", node)) - 1)) => node... + } : length(nodes) > 1 ? 
+ [format( + "%s-[%s:%s]", + prefix, + split("-", nodes[0])[length(split("-", nodes[0])) - 1], + split("-", nodes[length(nodes) - 1])[length(split("-", nodes[length(nodes) - 1])) - 1] + )] : nodes + ])) + ) + ) : [] + + client_nodes = var.scheduler == "LSF" ? var.enable_deployer ? [] : (flatten([module.landing_zone_vsi[0].client_vsi_data]))[*]["name"] : [] + gui_hosts = var.scheduler == "LSF" ? var.enable_deployer ? [] : [local.management_nodes[0]] : [] # Without Pac HA + db_hosts = var.scheduler == "LSF" ? var.enable_deployer ? [] : [local.management_nodes[0]] : [] # Without Pac HA + ha_shared_dir = var.scheduler == "LSF" ? "/mnt/lsf" : "" + nfs_install_dir = var.scheduler == "LSF" ? "none" : "" + enable_monitoring = var.scheduler == "LSF" ? false : false + lsf_deployer_hostname = var.scheduler == "LSF" ? var.deployer_hostname : "" + + cloud_logs_bucket = length([for bucket in local.cos_data : bucket if strcontains(bucket.bucket_name, "logs-data-bucket")]) > 0 ? [for bucket in local.cos_data : bucket if strcontains(bucket.bucket_name, "logs-data-bucket")][0] : null + cloud_metrics_bucket = length([for bucket in local.cos_data : bucket if strcontains(bucket.bucket_name, "metrics-data-bucket")]) > 0 ? [for bucket in local.cos_data : bucket if strcontains(bucket.bucket_name, "metrics-data-bucket")][0] : null + cloud_logs_data_bucket = jsonencode(local.cloud_logs_bucket != null ? { + bucket_crn = local.cloud_logs_bucket.crn + bucket_endpoint = local.cloud_logs_bucket.s3_endpoint_direct + } : null) + cloud_metrics_data_bucket = jsonencode(local.cloud_metrics_bucket != null ? { + bucket_crn = local.cloud_metrics_bucket.crn + bucket_endpoint = local.cloud_metrics_bucket.s3_endpoint_direct + } : null) + # scc_cos_bucket = length(module.landing_zone.cos_buckets_names) > 0 && var.scc_enable ? [for name in module.landing_zone.cos_buckets_names : name if strcontains(name, "scc-bucket")][0] : "" + # scc_cos_instance_crn = length(module.landing_zone.cos_instance_crns) > 0 && var.scc_enable ? module.landing_zone.cos_instance_crns[0] : "" + + compute_subnet_crn = var.enable_deployer ? "" : data.ibm_is_subnet.compute_subnet_crn[0].crn + ssh_keys_ids = var.enable_deployer ? [] : [for name in var.ssh_keys : data.ibm_is_ssh_key.ssh_keys[name].id] + compute_public_key_content = var.enable_deployer ? "" : jsonencode(base64encode(join("", flatten([module.landing_zone_vsi[0].compute_public_key_content])))) + compute_private_key_content = var.enable_deployer ? "" : jsonencode(base64encode(join("", flatten([module.landing_zone_vsi[0].compute_private_key_content])))) + + mgmnt_host_entry = var.scheduler == "LSF" ? { for vsi in flatten([module.landing_zone_vsi[*].management_vsi_data]) : vsi.ipv4_address => vsi.name } : {} + comp_host_entry = var.scheduler == "LSF" ? { for vsi in flatten([module.landing_zone_vsi[*].compute_vsi_data]) : vsi.ipv4_address => vsi.name } : {} + login_host_entry = var.scheduler == "LSF" ? { for vsi in flatten([module.landing_zone_vsi[*].login_vsi_data]) : vsi.ipv4_address => vsi.name } : {} + deployer_host_entry = var.scheduler == "LSF" ? 
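compute_nodes_list compresses sequentially numbered compute hosts into an LSF-style range expression so the generated inventory stays compact: names are grouped by everything before the last dash, and any group with more than one member is emitted as prefix-[first:last]. The same grouping applied to a hypothetical trio of nodes (assuming the names arrive in ascending order, as they do for the provisioned VSIs):

locals {
  example_nodes = ["demo-comp-001", "demo-comp-002", "demo-comp-003"] # hypothetical
  example_ranges = distinct(flatten([
    for prefix, nodes in {
      for node in local.example_nodes :
      join("-", slice(split("-", node), 0, length(split("-", node)) - 1)) => node...
    } : length(nodes) > 1 ? [format(
      "%s-[%s:%s]",
      prefix,
      split("-", nodes[0])[length(split("-", nodes[0])) - 1],
      split("-", nodes[length(nodes) - 1])[length(split("-", nodes[length(nodes) - 1])) - 1]
    )] : nodes
  ]))
  # -> ["demo-comp-[001:003]"]
}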
{ for inst in local.deployer_instances : inst.ipv4_address => inst.name if inst.ipv4_address != null } : {} +} + +locals { + # gpfs_base_rpm_path = fileset(var.spectrumscale_rpms_path, "gpfs.base-*") + # scale_org_version = regex("gpfs.base-(.*).x86_64.rpm", tolist(local.gpfs_base_rpm_path)[0])[0] + scale_version = "5.2.2.1" #replace(local.scale_org_version, "-", ".") + + compute_vsi_profile = var.static_compute_instances[*]["profile"] + storage_vsi_profile = var.storage_instances[*]["profile"] + storage_bms_profile = var.storage_servers[*]["profile"] + management_vsi_profile = var.management_instances[*]["profile"] + afm_vsi_profile = var.afm_instances[*]["profile"] + protocol_vsi_profile = var.protocol_instances[*]["profile"] + afm_server_type = strcontains(local.afm_vsi_profile[0], "metal") + ces_server_type = strcontains(local.protocol_vsi_profile[0], "metal") + + scale_ces_enabled = local.protocol_instance_count > 0 ? true : false + is_colocate_protocol_subset = local.scale_ces_enabled && var.colocate_protocol_instances ? local.protocol_instance_count < local.storage_instance_count ? true : false : false + enable_sec_interface_compute = local.scale_ces_enabled == false && data.ibm_is_instance_profile.compute_profile.bandwidth[0].value >= 64000 ? true : false + enable_sec_interface_storage = local.scale_ces_enabled == false && var.storage_type != "persistent" && data.ibm_is_instance_profile.storage_profile.bandwidth[0].value >= 64000 ? true : false + enable_mrot_conf = local.enable_sec_interface_compute && local.enable_sec_interface_storage ? true : false + enable_afm = sum(var.afm_instances[*]["count"]) > 0 ? true : false + + compute_instance_private_ips = flatten(local.compute_instances[*]["ipv4_address"]) + compute_instance_ids = flatten(local.compute_instances[*]["id"]) + compute_instance_names = try(tolist([for name_details in flatten(local.compute_instances[*]["name"]) : "${name_details}.${var.dns_domain_names["compute"]}"]), []) + + compute_mgmt_instance_private_ips = flatten(local.comp_mgmt_instances[*]["ipv4_address"]) + compute_mgmt_instance_ids = flatten(local.comp_mgmt_instances[*]["id"]) + compute_mgmt_instance_names = try(tolist([for name_details in flatten(local.comp_mgmt_instances[*]["name"]) : "${name_details}.${var.dns_domain_names["compute"]}"]), []) + + strg_instance_private_ips = flatten(local.storage_instances[*]["ipv4_address"]) + strg_instance_ids = flatten(local.storage_instances[*]["id"]) + strg_instance_names = try(tolist([for name_details in flatten(local.storage_instances[*]["name"]) : "${name_details}.${var.dns_domain_names["storage"]}"]), []) + + strg_servers_private_ips = flatten(local.storage_servers[*]["ipv4_address"]) + strg_servers_ids = flatten(local.storage_servers[*]["id"]) + strg_servers_names = try(tolist([for name_details in flatten(local.storage_servers[*]["name"]) : "${name_details}.${var.dns_domain_names["storage"]}"]), []) + + strg_mgmt_instance_private_ips = flatten(local.strg_mgmt_instances[*]["ipv4_address"]) + strg_mgmtt_instance_ids = flatten(local.strg_mgmt_instances[*]["id"]) + strg_mgmt_instance_names = try(tolist([for name_details in flatten(local.strg_mgmt_instances[*]["name"]) : "${name_details}.${var.dns_domain_names["storage"]}"]), []) + + strg_tie_breaker_private_ips = flatten(local.tie_brkr_instances[*]["ipv4_address"]) + strg_tie_breaker_instance_ids = flatten(local.tie_brkr_instances[*]["id"]) + strg_tie_breaker_instance_names = try(tolist([for name_details in flatten(local.tie_brkr_instances[*]["name"]) : 
"${name_details}.${var.dns_domain_names["storage"]}"]), []) + + secondary_compute_instance_private_ips = flatten(local.compute_instances[*]["secondary_ipv4_address"]) + # secondary_storage_instance_private_ips = flatten(local.storage_instances[*]["secondary_ipv4_address"]) + + protocol_instance_private_ips = flatten(local.protocol_instances[*]["ipv4_address"]) + protocol_instance_ids = flatten(local.protocol_instances[*]["id"]) + protocol_instance_names = try(tolist([for name_details in flatten(local.protocol_instances[*]["name"]) : "${name_details}.${var.dns_domain_names["storage"]}"]), []) + + protocol_cluster_instance_names = var.enable_deployer ? [] : slice((concat(local.protocol_instance_names, (var.storage_type == "persistent" ? [] : local.strg_instance_names))), 0, local.protocol_instance_count) + + # client_instance_private_ips = flatten(local.client_instances[*]["ipv4_address"]) + # client_instance_ids = flatten(local.client_instances[*]["id"]) + client_instance_names = try(tolist([for name_details in flatten(local.client_instances[*]["name"]) : "${name_details}.${var.dns_domain_names["client"]}"]), []) + + gklm_instance_private_ips = flatten(local.gklm_instances[*]["ipv4_address"]) + # gklm_instance_ids = flatten(local.gklm_instances[*]["id"]) + # gklm_instance_names = try(tolist([for name_details in flatten(local.gklm_instances[*]["name"]) : "${name_details}.${var.dns_domain_names["storage"]}"]), []) + + ldap_instance_private_ips = flatten(local.ldap_instances[*]["ipv4_address"]) + # ldap_instance_ids = flatten(local.ldap_instances[*]["id"]) + # ldap_instance_names = flatten(local.ldap_instances[*]["name"]) +} + +locals { + afm_instance_private_ips = flatten(local.afm_instances[*]["ipv4_address"]) + afm_instance_ids = flatten(local.afm_instances[*]["id"]) + afm_instance_names = try(tolist([for name_details in flatten(local.afm_instances[*]["name"]) : "${name_details}.${var.dns_domain_names["storage"]}"]), []) + + new_instance_bucket_hmac = [for details in var.afm_cos_config : details if(details.cos_instance == "" && details.bucket_name == "" && details.cos_service_cred_key == "")] + exstng_instance_new_bucket_hmac = [for details in var.afm_cos_config : details if(details.cos_instance != "" && details.bucket_name == "" && details.cos_service_cred_key == "")] + exstng_instance_bucket_new_hmac = [for details in var.afm_cos_config : details if(details.cos_instance != "" && details.bucket_name != "" && details.cos_service_cred_key == "")] + exstng_instance_hmac_new_bucket = [for details in var.afm_cos_config : details if(details.cos_instance != "" && details.bucket_name == "" && details.cos_service_cred_key != "")] + exstng_instance_bucket_hmac = [for details in var.afm_cos_config : details if(details.cos_instance != "" && details.bucket_name != "" && details.cos_service_cred_key != "")] + + afm_cos_bucket_details = local.enable_afm == true ? flatten(module.cos[*].afm_cos_bucket_details) : [] + afm_cos_config = local.enable_afm == true ? flatten(module.cos[*].afm_config_details) : [] +} + + +locals { + + storage_instance_private_ips = var.storage_type != "persistent" ? local.enable_afm == true ? concat(local.strg_instance_private_ips, local.afm_instance_private_ips) : local.strg_instance_private_ips : [] + storage_instance_ids = var.storage_type != "persistent" ? local.enable_afm == true ? concat(local.strg_instance_ids, local.afm_instance_ids) : local.strg_instance_ids : [] + storage_instance_names = var.storage_type != "persistent" ? local.enable_afm == true ? 
concat(local.strg_instance_names, local.afm_instance_names) : local.strg_instance_names : [] + storage_ips_with_vol_mapping = module.landing_zone_vsi[*].instance_ips_with_vol_mapping + + storage_cluster_instance_private_ips = local.scale_ces_enabled == false ? local.storage_instance_private_ips : concat(local.storage_instance_private_ips, local.protocol_instance_private_ips) + storage_cluster_instance_ids = local.scale_ces_enabled == false ? local.storage_instance_ids : concat(local.storage_instance_ids, local.protocol_instance_ids) + storage_cluster_instance_names = local.scale_ces_enabled == false ? local.storage_instance_names : concat(local.storage_instance_names, local.protocol_instance_names) + + baremetal_instance_private_ips = var.storage_type == "persistent" ? local.enable_afm == true ? concat(local.strg_servers_private_ips, local.afm_instance_private_ips) : local.strg_servers_private_ips : [] + baremetal_instance_ids = var.storage_type == "persistent" ? local.enable_afm == true ? concat(local.strg_servers_ids, local.afm_instance_ids) : local.strg_servers_ids : [] + baremetal_instance_names = var.storage_type == "persistent" ? local.enable_afm == true ? concat(local.strg_servers_names, local.afm_instance_names) : local.strg_servers_names : [] + + baremetal_cluster_instance_private_ips = var.storage_type == "persistent" && local.scale_ces_enabled == false ? local.baremetal_instance_private_ips : concat(local.baremetal_instance_private_ips, local.protocol_instance_private_ips) + baremetal_cluster_instance_ids = var.storage_type == "persistent" && local.scale_ces_enabled == false ? local.baremetal_instance_ids : concat(local.baremetal_instance_ids, local.protocol_instance_ids) + baremetal_cluster_instance_names = var.storage_type == "persistent" && local.scale_ces_enabled == false ? local.baremetal_instance_names : concat(local.baremetal_instance_names, local.protocol_instance_names) + + tie_breaker_storage_instance_private_ips = var.storage_type != "persistent" ? local.strg_tie_breaker_private_ips : local.baremetal_instance_private_ips + tie_breaker_storage_instance_ids = var.storage_type != "persistent" ? local.strg_tie_breaker_instance_ids : local.baremetal_instance_ids + tie_breaker_storage_instance_names = var.storage_type != "persistent" ? local.strg_tie_breaker_instance_names : local.baremetal_instance_names + tie_breaker_ips_with_vol_mapping = module.landing_zone_vsi[*].instance_ips_with_vol_mapping_tie_breaker + + storage_subnet_cidr = var.enable_deployer ? "" : local.storage_instance_count > 0 ? jsonencode((data.ibm_is_subnet.existing_storage_subnets[*].ipv4_cidr_block)[0]) : "" + cluster_subnet_cidr = var.enable_deployer ? "" : jsonencode((data.ibm_is_subnet.existing_cluster_subnets[*].ipv4_cidr_block)[0]) + client_subnet_cidr = var.enable_deployer ? "" : local.client_instance_count > 0 ? 
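The same three-way selection repeats for private IPs, IDs, and names: bare-metal servers back the storage cluster when storage_type is "persistent", VSIs otherwise, and AFM nodes are appended only when AFM instances were requested. A compact sketch of the name variant with made-up lists:

locals {
  example_storage_type = "scratch"                          # hypothetical, or "persistent"
  example_enable_afm   = true                               # hypothetical
  example_vsi_names    = ["demo-strg-001.strg.example.com"] # hypothetical
  example_afm_names    = ["demo-afm-001.strg.example.com"]  # hypothetical

  # VSI-backed storage gets the AFM nodes appended when AFM is enabled
  example_storage_names = local.example_storage_type != "persistent" ? (
    local.example_enable_afm ? concat(local.example_vsi_names, local.example_afm_names) : local.example_vsi_names
  ) : []
}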
jsonencode((data.ibm_is_subnet.existing_client_subnets[*].ipv4_cidr_block)[0]) : "" + + compute_memory = data.ibm_is_instance_profile.compute_profile.memory[0].value + compute_vcpus_count = data.ibm_is_instance_profile.compute_profile.vcpu_count[0].value + compute_bandwidth = data.ibm_is_instance_profile.compute_profile.bandwidth[0].value + management_memory = data.ibm_is_instance_profile.management_profile.memory[0].value + management_vcpus_count = data.ibm_is_instance_profile.management_profile.vcpu_count[0].value + management_bandwidth = data.ibm_is_instance_profile.management_profile.bandwidth[0].value + storage_desc_memory = data.ibm_is_instance_profile.storage_profile.memory[0].value + storage_desc_vcpus_count = data.ibm_is_instance_profile.storage_profile.vcpu_count[0].value + storage_desc_bandwidth = data.ibm_is_instance_profile.storage_profile.bandwidth[0].value + storage_memory = var.storage_type == "persistent" ? data.ibm_is_bare_metal_server_profile.storage_bms_profile[0].memory[0].value : data.ibm_is_instance_profile.storage_profile.memory[0].value + storage_vcpus_count = var.storage_type == "persistent" ? data.ibm_is_bare_metal_server_profile.storage_bms_profile[0].cpu_core_count[0].value : data.ibm_is_instance_profile.storage_profile.vcpu_count[0].value + storage_bandwidth = var.storage_type == "persistent" ? data.ibm_is_bare_metal_server_profile.storage_bms_profile[0].bandwidth[0].value : data.ibm_is_instance_profile.storage_profile.bandwidth[0].value + protocol_memory = (local.scale_ces_enabled == true && var.colocate_protocol_instances == false) ? local.ces_server_type == false ? data.ibm_is_instance_profile.protocol_profile[0].memory[0].value : jsonencode(0) : jsonencode(0) + protocol_vcpus_count = (local.scale_ces_enabled == true && var.colocate_protocol_instances == false) ? local.ces_server_type == false ? data.ibm_is_instance_profile.protocol_profile[0].vcpu_count[0].value : jsonencode(0) : jsonencode(0) + protocol_bandwidth = (local.scale_ces_enabled == true && var.colocate_protocol_instances == false) ? local.ces_server_type == false ? data.ibm_is_instance_profile.protocol_profile[0].bandwidth[0].value : jsonencode(0) : jsonencode(0) + storage_protocol_memory = var.storage_type == "persistent" ? data.ibm_is_bare_metal_server_profile.storage_bms_profile[0].memory[0].value : data.ibm_is_instance_profile.storage_profile.memory[0].value + storage_protocol_vcpus_count = var.storage_type == "persistent" ? data.ibm_is_bare_metal_server_profile.storage_bms_profile[0].cpu_core_count[0].value : data.ibm_is_instance_profile.storage_profile.vcpu_count[0].value + storage_protocol_bandwidth = var.storage_type == "persistent" ? data.ibm_is_bare_metal_server_profile.storage_bms_profile[0].bandwidth[0].value : data.ibm_is_instance_profile.storage_profile.bandwidth[0].value + afm_memory = local.afm_server_type == true ? jsonencode("") : data.ibm_is_instance_profile.afm_server_profile[0].memory[0].value + afm_vcpus_count = local.afm_server_type == true ? jsonencode("") : data.ibm_is_instance_profile.afm_server_profile[0].vcpu_count[0].value + afm_bandwidth = local.afm_server_type == true ? jsonencode("") : data.ibm_is_instance_profile.afm_server_profile[0].bandwidth[0].value + + protocol_reserved_name_ips_map = try({ for details in data.ibm_is_subnet_reserved_ips.protocol_subnet_reserved_ips[0].reserved_ips : details.name => details.address }, {}) + protocol_subnet_gateway_ip = local.scale_ces_enabled == true ? 
local.protocol_reserved_name_ips_map.ibm-default-gateway : "" +} + +# Existing bastion Variables +locals { + bastion_instance_public_ip = var.existing_bastion_instance_name != null ? var.existing_bastion_instance_public_ip : null + bastion_ssh_private_key = var.existing_bastion_instance_name != null ? var.existing_bastion_ssh_private_key : null +} + +locals { + existing_vpc_cidr = var.vpc_name != null ? data.ibm_is_vpc_address_prefixes.existing_vpc_cidr[0].address_prefixes[0].cidr : null + cluster_cidr = var.vpc_name == null ? var.vpc_cidr : local.existing_vpc_cidr +} + +# locals needed for ssh connection +locals { + ssh_forward_host = var.enable_deployer ? "" : local.mgmt_hosts_ips[0] + ssh_forwards = var.enable_deployer ? "" : "-L 8443:${local.ssh_forward_host}:8443 -L 6080:${local.ssh_forward_host}:6080 -L 8444:${local.ssh_forward_host}:8444" + ssh_jump_host = var.enable_deployer ? "" : local.bastion_instance_public_ip != null ? local.bastion_instance_public_ip : var.bastion_fip + ssh_jump_option = var.enable_deployer ? "" : "-J ubuntu@${local.ssh_jump_host}" + ssh_cmd = var.enable_deployer ? "" : "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 ${local.ssh_forwards} ${local.ssh_jump_option} lsfadmin@${join(",", local.login_host_ip)}" +} + +#locals { +# cloud_monitoring_instance_crn = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation.cloud_monitoring_crn : null +#} + +# locals { +# cloud_monitoring_instance_crn = var.enable_deployer ? "" : var.observability_monitoring_enable && length(module.cloud_monitoring_instance_creation) > 0 ? module.cloud_monitoring_instance_creation[0].cloud_monitoring_crn : null +# } diff --git a/main.tf b/main.tf new file mode 100644 index 00000000..6dbfd1a6 --- /dev/null +++ b/main.tf @@ -0,0 +1,805 @@ +module "landing_zone" { + source = "./modules/landing_zone" + enable_landing_zone = var.enable_landing_zone + vpc_cluster_private_subnets_cidr_blocks = [var.vpc_cluster_private_subnets_cidr_blocks] + cos_instance_name = var.cos_instance_name + bastion_subnet_id = local.bastion_subnet_id + compute_subnet_id = local.subnet_id + enable_atracker = var.observability_atracker_enable && (var.observability_atracker_target_type == "cos") ? 
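The ssh connection locals assemble a single command that jumps through the bastion as ubuntu, logs in to the login node as lsfadmin, and forwards ports 8443, 6080, and 8444 from the first management node back to the workstation. With hypothetical addresses the string expands to:

locals {
  # bastion 150.240.0.10, first management node 10.241.0.7, login node 10.241.0.9
  # (all addresses invented for illustration)
  example_ssh_cmd = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -L 8443:10.241.0.7:8443 -L 6080:10.241.0.7:6080 -L 8444:10.241.0.7:8444 -J ubuntu@150.240.0.10 lsfadmin@10.241.0.9"
}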
true : false + enable_cos_integration = var.enable_cos_integration + enable_vpc_flow_logs = var.enable_vpc_flow_logs + key_management = local.key_management + kms_instance_name = var.kms_instance_name + kms_key_name = var.kms_key_name + ssh_keys = var.ssh_keys + vpc_cluster_login_private_subnets_cidr_blocks = var.vpc_cluster_login_private_subnets_cidr_blocks + management_instances = var.management_instances + compute_instances = var.static_compute_instances + cluster_cidr = local.cluster_cidr + placement_strategy = var.placement_strategy + prefix = var.cluster_prefix + protocol_instances = var.protocol_instances + protocol_subnets_cidr = var.protocol_subnets_cidr + existing_resource_group = var.existing_resource_group + storage_instances = var.storage_instances + storage_servers = var.storage_servers + storage_subnets_cidr = var.storage_subnets_cidr + storage_type = var.storage_type + client_instances = var.client_instances + client_subnets_cidr = var.client_subnets_cidr + vpc_name = var.vpc_name + zones = var.zones + enable_vpn = var.vpn_enabled + skip_flowlogs_s2s_auth_policy = var.skip_flowlogs_s2s_auth_policy + skip_kms_s2s_auth_policy = var.skip_kms_s2s_auth_policy + observability_logs_enable = var.observability_logs_enable_for_management || var.observability_logs_enable_for_compute || (var.observability_atracker_enable && var.observability_atracker_target_type == "cloudlogs") ? true : false + # hpcs_instance_name = var.hpcs_instance_name + # clusters = var.clusters +} + +module "deployer" { + source = "./modules/deployer" + scheduler = var.scheduler + resource_group = local.resource_group_ids["workload_rg"] + prefix = var.cluster_prefix + vpc_id = local.vpc_id + zones = var.zones + cluster_cidr = local.cluster_cidr + ext_login_subnet_id = var.login_subnet_id + bastion_subnets = local.login_subnets + ext_cluster_subnet_id = var.cluster_subnet_id + cluster_subnets = local.cluster_subnets + bastion_instance = var.bastion_instance + enable_deployer = var.enable_deployer + deployer_instance = var.deployer_instance + ssh_keys = var.ssh_keys + allowed_cidr = var.remote_allowed_ips + kms_encryption_enabled = local.kms_encryption_enabled + boot_volume_encryption_key = local.boot_volume_encryption_key + existing_kms_instance_guid = local.existing_kms_instance_guid + dns_domain_names = var.dns_domain_names + skip_iam_authorization_policy = var.skip_iam_block_storage_authorization_policy + ext_vpc_name = var.vpc_name + bastion_instance_name = var.existing_bastion_instance_name + bastion_instance_public_ip = local.bastion_instance_public_ip + existing_bastion_security_group_id = var.existing_bastion_instance_name != null ? var.existing_bastion_security_group_id : null +} + +module "landing_zone_vsi" { + count = var.enable_deployer == false ? 
1 : 0 + source = "./modules/landing_zone_vsi" + resource_group = var.resource_group_ids["workload_rg"] + prefix = var.cluster_prefix + vpc_id = local.vpc_id + zones = var.zones + bastion_security_group_id = var.bastion_security_group_id + bastion_public_key_content = local.bastion_public_key_content + ssh_keys = var.ssh_keys + client_subnets = local.client_subnets + client_instances = var.client_instances + cluster_subnet_id = local.cluster_subnets + management_instances = var.management_instances + static_compute_instances = var.static_compute_instances + dynamic_compute_instances = var.dynamic_compute_instances + storage_subnets = local.storage_subnets + storage_instances = var.storage_instances + storage_servers = var.storage_servers + storage_type = var.storage_type + protocol_subnets = local.protocol_subnets + protocol_instances = var.protocol_instances + nsd_details = var.nsd_details + dns_domain_names = var.dns_domain_names + kms_encryption_enabled = local.kms_encryption_enabled + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid + enable_deployer = var.enable_deployer + afm_instances = var.afm_instances + enable_dedicated_host = var.enable_dedicated_host + enable_ldap = var.enable_ldap + ldap_instances = var.ldap_instance + ldap_server = local.ldap_server + ldap_instance_key_pair = local.ldap_instance_key_pair + scale_encryption_enabled = var.scale_encryption_enabled + scale_encryption_type = var.scale_encryption_type + gklm_instance_key_pair = local.gklm_instance_key_pair + gklm_instances = var.gklm_instances + vpc_region = local.region + scheduler = var.scheduler + ibm_customer_number = var.ibm_customer_number + colocate_protocol_instances = var.colocate_protocol_instances + storage_security_group_id = var.storage_security_group_id + login_instance = var.login_instance + bastion_subnets = local.login_subnets + cluster_cidr = local.cluster_cidr +} + +module "prepare_tf_input" { + source = "./modules/prepare_tf_input" + scheduler = var.scheduler + enable_deployer = var.enable_deployer + deployer_ip = local.deployer_ip + bastion_fip = local.bastion_fip + ibmcloud_api_key = var.ibmcloud_api_key + app_center_gui_password = var.app_center_gui_password + lsf_version = var.lsf_version + resource_group_ids = local.resource_group_ids + cluster_prefix = var.cluster_prefix + zones = var.zones + ssh_keys = local.ssh_keys + storage_instances = var.storage_instances + storage_servers = var.storage_servers + storage_type = var.storage_type + management_instances = var.management_instances + protocol_instances = var.protocol_instances + colocate_protocol_instances = var.colocate_protocol_instances + ibm_customer_number = var.ibm_customer_number + static_compute_instances = var.static_compute_instances + dynamic_compute_instances = var.dynamic_compute_instances + client_instances = var.client_instances + enable_cos_integration = var.enable_cos_integration + enable_atracker = var.enable_atracker + enable_vpc_flow_logs = var.enable_vpc_flow_logs + enable_dedicated_host = var.enable_dedicated_host + remote_allowed_ips = var.remote_allowed_ips + vpc_name = local.vpc_name + storage_subnets = local.storage_subnet + protocol_subnets = local.protocol_subnet + cluster_subnet_id = local.cluster_subnet + client_subnets = local.client_subnet + login_subnet_id = local.login_subnet + login_instance = var.login_instance + dns_domain_names = var.dns_domain_names + key_management = local.key_management + kms_instance_name = 
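The count guards in main.tf suggest a two-pass flow: with enable_deployer = true only the landing zone, bastion/deployer, and provisioning helpers are created, and the cluster-facing modules (landing_zone_vsi, file_storage, dns, and the DNS-record modules) are instantiated only on the pass where enable_deployer is false. A minimal sketch of that gating with a hypothetical module path:

module "example_cluster_stage_only" {
  # created only when the cluster-facing pass runs (enable_deployer = false)
  count  = var.enable_deployer == false ? 1 : 0
  source = "./modules/example" # hypothetical path
}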
var.kms_instance_name + kms_key_name = var.kms_key_name + boot_volume_encryption_key = local.boot_volume_encryption_key + existing_kms_instance_guid = local.existing_kms_instance_guid + skip_iam_share_authorization_policy = var.skip_iam_share_authorization_policy + dns_custom_resolver_id = var.dns_custom_resolver_id + dns_instance_id = var.dns_instance_id + bastion_security_group_id = local.bastion_security_group_id + deployer_hostname = local.deployer_hostname + enable_hyperthreading = var.enable_hyperthreading + cloud_logs_data_bucket = local.cloud_logs_data_bucket + cloud_metrics_data_bucket = local.cloud_metrics_data_bucket + observability_logs_enable_for_management = var.observability_logs_enable_for_management + observability_logs_enable_for_compute = var.observability_logs_enable_for_compute + observability_enable_platform_logs = var.observability_enable_platform_logs + observability_monitoring_enable = var.observability_monitoring_enable + observability_monitoring_plan = var.observability_monitoring_plan + observability_logs_retention_period = var.observability_logs_retention_period + observability_monitoring_on_compute_nodes_enable = var.observability_monitoring_on_compute_nodes_enable + observability_enable_metrics_routing = var.observability_enable_metrics_routing + observability_atracker_enable = var.observability_atracker_enable + observability_atracker_target_type = var.observability_atracker_target_type + enable_ldap = var.enable_ldap + ldap_instance = var.ldap_instance + ldap_server = local.ldap_server + ldap_basedns = var.ldap_basedns + ldap_server_cert = local.ldap_server_cert + ldap_admin_password = local.ldap_admin_password + ldap_instance_key_pair = local.ldap_instance_key_pair + ldap_user_password = var.ldap_user_password + ldap_user_name = var.ldap_user_name + afm_instances = var.afm_instances + afm_cos_config = var.afm_cos_config + gklm_instance_key_pair = local.gklm_instance_key_pair + gklm_instances = var.gklm_instances + scale_encryption_type = var.scale_encryption_type + filesystem_config = var.filesystem_config + scale_encryption_admin_password = var.scale_encryption_admin_password + scale_encryption_enabled = var.scale_encryption_enabled + storage_security_group_id = var.storage_security_group_id + custom_file_shares = var.custom_file_shares + existing_bastion_instance_name = var.existing_bastion_instance_name + existing_bastion_security_group_id = var.existing_bastion_security_group_id + vpc_cluster_private_subnets_cidr_blocks = var.vpc_cluster_private_subnets_cidr_blocks + sccwp_enable = var.sccwp_enable + sccwp_service_plan = var.sccwp_service_plan + cspm_enabled = var.cspm_enabled + app_config_plan = var.app_config_plan + existing_resource_group = var.existing_resource_group + depends_on = [module.deployer] +} + +module "validate_ldap_server_connection" { + count = var.enable_deployer && var.enable_ldap && local.ldap_server != "null" ? 1 : 0 + source = "./modules/ldap_remote_exec" + ldap_server = local.ldap_server + bastion_fip = local.bastion_fip + bastion_private_key_content = local.bastion_ssh_private_key != null ? local.bastion_ssh_private_key : local.bastion_private_key_content + deployer_ip = local.deployer_ip + depends_on = [module.deployer] +} + +module "resource_provisioner" { + source = "./modules/resource_provisioner" + ibmcloud_api_key = var.ibmcloud_api_key + enable_deployer = var.enable_deployer + cluster_prefix = var.cluster_prefix + bastion_fip = local.bastion_fip + bastion_private_key_content = local.bastion_ssh_private_key != null ? 
local.bastion_ssh_private_key : local.bastion_private_key_content + deployer_ip = local.deployer_ip + scheduler = var.scheduler + existing_bastion_instance_name = var.existing_bastion_instance_name + bastion_public_key_content = local.bastion_public_key_content + depends_on = [module.deployer, module.prepare_tf_input, module.validate_ldap_server_connection] +} + +module "cos" { + count = var.scheduler == "Scale" && local.enable_afm == true ? 1 : 0 + source = "./modules/cos" + prefix = "${var.cluster_prefix}-" + resource_group_id = local.resource_group_ids["service_rg"] + cos_instance_plan = "standard" + cos_instance_location = "global" + cos_instance_service = "cloud-object-storage" + cos_hmac_role = "Manager" + new_instance_bucket_hmac = local.new_instance_bucket_hmac + exstng_instance_new_bucket_hmac = local.exstng_instance_new_bucket_hmac + exstng_instance_bucket_new_hmac = local.exstng_instance_bucket_new_hmac + exstng_instance_hmac_new_bucket = local.exstng_instance_hmac_new_bucket + exstng_instance_bucket_hmac = local.exstng_instance_bucket_hmac + filesystem = var.storage_instances[*]["filesystem"] != "" ? var.storage_instances[0]["filesystem"] : var.filesystem_config[0]["filesystem"] + depends_on = [module.landing_zone_vsi] +} + +module "file_storage" { + count = var.enable_deployer == false ? 1 : 0 + source = "./modules/file_storage" + zone = var.zones[0] # always the first zone + resource_group_id = var.resource_group_ids["workload_rg"] + file_shares = local.file_shares + encryption_key_crn = local.boot_volume_encryption_key + security_group_ids = local.compute_security_group_id + subnet_id = local.compute_subnet_id + existing_kms_instance_guid = var.existing_kms_instance_guid + skip_iam_share_authorization_policy = var.skip_iam_share_authorization_policy + kms_encryption_enabled = local.kms_encryption_enabled +} + +module "dns" { + count = var.enable_deployer == false ? 1 : 0 + source = "./modules/dns" + prefix = var.cluster_prefix + resource_group_id = var.resource_group_ids["service_rg"] + vpc_crn = local.vpc_crn + subnets_crn = local.subnets_crn + dns_instance_id = var.dns_instance_id + dns_custom_resolver_id = var.dns_custom_resolver_id + dns_domain_names = compact(values(var.dns_domain_names)) +} + +module "compute_dns_records" { + count = var.enable_deployer == false ? 1 : 0 + source = "./modules/dns_record" + dns_instance_id = local.dns_instance_id + dns_zone_id = local.compute_dns_zone_id + dns_records = local.compute_dns_records + depends_on = [module.dns] +} + +module "storage_dns_records" { + count = var.enable_deployer == false && length(var.storage_instances) > 0 ? 1 : 0 + source = "./modules/dns_record" + dns_instance_id = local.dns_instance_id + dns_zone_id = local.storage_dns_zone_id + dns_records = local.storage_dns_records + depends_on = [module.dns] +} + +module "protocol_reserved_ip" { + count = var.scheduler == "Scale" && var.enable_deployer == false && var.protocol_subnets != null ? 1 : 0 + source = "./modules/protocol_reserved_ip" + total_reserved_ips = local.protocol_instance_count + subnet_id = [local.protocol_subnets[0].id] + name = format("%s-ces", var.cluster_prefix) + protocol_domain = var.dns_domain_names["protocol"] + protocol_dns_service_id = local.dns_instance_id + protocol_dns_zone_id = local.protocol_dns_zone_id + depends_on = [module.dns] +} + +module "client_dns_records" { + count = var.enable_deployer == false && length(var.client_instances) > 0 ? 
1 : 0 + source = "./modules/dns_record" + dns_instance_id = local.dns_instance_id + dns_zone_id = local.client_dns_zone_id + dns_records = local.client_dns_records + depends_on = [module.dns] +} + +module "gklm_dns_records" { + count = var.enable_deployer == false && length(var.gklm_instances) > 0 ? 1 : 0 + source = "./modules/dns_record" + dns_instance_id = local.dns_instance_id + dns_zone_id = local.gklm_dns_zone_id + dns_records = local.gklm_dns_records + depends_on = [module.dns] +} + +resource "time_sleep" "wait_60_seconds" { + create_duration = "60s" + depends_on = [module.storage_dns_records, module.protocol_reserved_ip, module.compute_dns_records] +} + +module "write_compute_cluster_inventory" { + count = var.enable_deployer == false ? 1 : 0 + source = "./modules/write_inventory" + json_inventory_path = local.json_inventory_path + lsf_masters = local.management_nodes + lsf_servers = local.compute_nodes_list + lsf_clients = local.client_nodes + gui_hosts = local.gui_hosts + db_hosts = local.db_hosts + login_host = local.login_host + prefix = var.cluster_prefix + ha_shared_dir = local.ha_shared_dir + nfs_install_dir = local.nfs_install_dir + enable_monitoring = local.enable_monitoring + lsf_deployer_hostname = local.lsf_deployer_hostname + ibmcloud_api_key = var.ibmcloud_api_key + app_center_gui_password = var.app_center_gui_password + lsf_version = var.lsf_version + dns_domain_names = var.dns_domain_names + compute_public_key_content = local.compute_public_key_content + compute_private_key_content = local.compute_private_key_content + enable_hyperthreading = var.enable_hyperthreading + compute_subnet_id = local.compute_subnet_id + region = local.region + resource_group_id = var.resource_group_ids["service_rg"] + zones = var.zones + vpc_id = local.vpc_id + compute_subnets_cidr = [var.vpc_cluster_private_subnets_cidr_blocks] + dynamic_compute_instances = var.dynamic_compute_instances + compute_security_group_id = local.compute_security_group_id + compute_ssh_keys_ids = local.ssh_keys_ids + compute_subnet_crn = local.compute_subnet_crn + kms_encryption_enabled = local.kms_encryption_enabled + boot_volume_encryption_key = var.boot_volume_encryption_key + depends_on = [time_sleep.wait_60_seconds, module.landing_zone_vsi] +} + +module "write_compute_scale_cluster_inventory" { + count = var.scheduler == "Scale" && var.enable_deployer == false ? 1 : 0 + source = "./modules/write_scale_inventory" + json_inventory_path = var.scheduler == "Scale" ? format("%s/compute_cluster_inventory.json", var.scale_ansible_repo_clone_path) : format("%s/compute_cluster_inventory.json", local.json_inventory_path) + bastion_user = jsonencode(var.bastion_user) + bastion_instance_id = var.bastion_instance_id == null ? jsonencode("None") : jsonencode(var.bastion_instance_id) + bastion_instance_public_ip = var.bastion_fip == null ? jsonencode("None") : jsonencode(var.bastion_fip) + cloud_platform = jsonencode("IBMCloud") + resource_prefix = jsonencode(format("%s.%s", var.cluster_prefix, var.dns_domain_names["compute"])) + vpc_region = jsonencode(local.region) + vpc_availability_zones = var.zones + scale_version = jsonencode(local.scale_version) + compute_cluster_filesystem_mountpoint = jsonencode(var.scale_compute_cluster_filesystem_mountpoint) + storage_cluster_filesystem_mountpoint = jsonencode("None") + filesystem_block_size = jsonencode("None") + compute_cluster_instance_private_ips = concat((local.enable_sec_interface_compute ? 
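The Scale inventory writers appear to expect pre-encoded JSON values, so optional scalars are passed as jsonencode("None") rather than as Terraform null. A small illustration of that convention (names are hypothetical):

locals {
  example_bastion_instance_id = null # hypothetical input
  example_encoded_id          = local.example_bastion_instance_id == null ? jsonencode("None") : jsonencode(local.example_bastion_instance_id)
  # jsonencode("None") yields the literal text "\"None\""
}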
local.secondary_compute_instance_private_ips : local.compute_instance_private_ips), local.compute_mgmt_instance_private_ips) + compute_cluster_instance_ids = concat((local.enable_sec_interface_compute ? local.secondary_compute_instance_private_ips : local.compute_instance_ids), local.compute_mgmt_instance_ids) + compute_cluster_instance_names = concat((local.enable_sec_interface_compute ? local.secondary_compute_instance_private_ips : local.compute_instance_names), local.compute_mgmt_instance_names) + compute_cluster_instance_private_dns_ip_map = {} + storage_cluster_instance_ids = [] + storage_cluster_instance_private_ips = [] + storage_cluster_with_data_volume_mapping = {} + storage_cluster_instance_private_dns_ip_map = {} + storage_cluster_desc_instance_ids = [] + storage_cluster_desc_instance_private_ips = [] + storage_cluster_desc_data_volume_mapping = {} + storage_cluster_desc_instance_private_dns_ip_map = {} + storage_cluster_instance_names = [] + storage_subnet_cidr = local.enable_mrot_conf ? local.storage_subnet_cidr : jsonencode("") + compute_subnet_cidr = local.enable_mrot_conf ? local.cluster_subnet_cidr : jsonencode("") + scale_remote_cluster_clustername = local.enable_mrot_conf ? jsonencode(format("%s.%s", var.cluster_prefix, var.dns_domain_names["storage"])) : jsonencode("") + protocol_cluster_instance_names = [] + client_cluster_instance_names = [] + protocol_cluster_reserved_names = "" + smb = false + nfs = true + object = false + interface = [] + export_ip_pool = [] + filesystem = jsonencode("") + mountpoint = jsonencode("") + protocol_gateway_ip = jsonencode("") + filesets = local.fileset_size_map #{} + afm_cos_bucket_details = [] + afm_config_details = [] + afm_cluster_instance_names = [] + filesystem_mountpoint = var.scale_encryption_type == "key_protect" ? (var.storage_instances[*]["filesystem"] != "" ? var.storage_instances[*]["filesystem"] : jsonencode(var.filesystem_config[0]["filesystem"])) : jsonencode("") + depends_on = [time_sleep.wait_60_seconds] +} + +module "write_storage_scale_cluster_inventory" { + count = var.scheduler == "Scale" && var.enable_deployer == false ? 1 : 0 + source = "./modules/write_scale_inventory" + json_inventory_path = format("%s/storage_cluster_inventory.json", var.scale_ansible_repo_clone_path) + bastion_user = jsonencode(var.bastion_user) + bastion_instance_id = var.bastion_instance_id == null ? jsonencode("None") : jsonencode(var.bastion_instance_id) + bastion_instance_public_ip = var.bastion_fip == null ? jsonencode("None") : jsonencode(var.bastion_fip) + cloud_platform = jsonencode("IBMCloud") + resource_prefix = jsonencode(format("%s.%s", var.cluster_prefix, var.dns_domain_names["storage"])) + vpc_region = jsonencode(local.region) + vpc_availability_zones = var.zones + scale_version = jsonencode(local.scale_version) + compute_cluster_filesystem_mountpoint = jsonencode("None") + storage_cluster_filesystem_mountpoint = jsonencode(var.filesystem_config[0]["mount_point"]) #jsonencode(var.storage_instances[count.index].filesystem) + filesystem_block_size = jsonencode(var.filesystem_config[0]["block_size"]) + compute_cluster_instance_ids = [] + compute_cluster_instance_private_ips = [] + compute_cluster_instance_private_dns_ip_map = {} + compute_cluster_instance_names = [] + storage_cluster_instance_ids = var.storage_type == "persistent" ? 
concat(local.baremetal_cluster_instance_ids, local.strg_mgmtt_instance_ids, local.tie_breaker_storage_instance_ids) : concat(local.storage_cluster_instance_ids, local.strg_mgmtt_instance_ids, local.tie_breaker_storage_instance_ids) + storage_cluster_instance_private_ips = var.storage_type == "persistent" ? concat(local.baremetal_cluster_instance_private_ips, local.strg_mgmt_instance_private_ips, local.tie_breaker_storage_instance_private_ips) : concat(local.storage_cluster_instance_private_ips, local.strg_mgmt_instance_private_ips, local.tie_breaker_storage_instance_private_ips) + storage_cluster_instance_names = var.storage_type == "persistent" ? concat(local.baremetal_cluster_instance_names, local.strg_mgmt_instance_names, local.tie_breaker_storage_instance_names) : concat(local.storage_cluster_instance_names, local.strg_mgmt_instance_names, local.tie_breaker_storage_instance_names) + storage_cluster_with_data_volume_mapping = local.storage_ips_with_vol_mapping[0] + storage_cluster_instance_private_dns_ip_map = {} + storage_cluster_desc_instance_private_ips = local.strg_tie_breaker_private_ips + storage_cluster_desc_instance_ids = local.strg_tie_breaker_instance_ids + storage_cluster_desc_data_volume_mapping = local.tie_breaker_ips_with_vol_mapping[0] + storage_cluster_desc_instance_private_dns_ip_map = {} + storage_subnet_cidr = local.enable_mrot_conf ? local.storage_subnet_cidr : jsonencode("") + compute_subnet_cidr = local.enable_mrot_conf ? local.cluster_subnet_cidr : local.scale_ces_enabled == true ? local.client_subnet_cidr : jsonencode("") + scale_remote_cluster_clustername = local.enable_mrot_conf ? jsonencode(format("%s.%s", var.cluster_prefix, var.dns_domain_names["compute"])) : jsonencode("") + protocol_cluster_instance_names = local.scale_ces_enabled == true ? local.protocol_cluster_instance_names : [] + client_cluster_instance_names = [] + protocol_cluster_reserved_names = "" + smb = false + nfs = local.scale_ces_enabled == true ? true : false + object = false + interface = [] + export_ip_pool = local.scale_ces_enabled == true ? values(one(module.protocol_reserved_ip[*].instance_name_ip_map)) : [] + filesystem = local.scale_ces_enabled == true ? jsonencode("cesSharedRoot") : jsonencode("") + mountpoint = local.scale_ces_enabled == true ? jsonencode(var.filesystem_config[0]["mount_point"]) : jsonencode("") + protocol_gateway_ip = jsonencode(local.protocol_subnet_gateway_ip) + filesets = local.fileset_size_map + afm_cos_bucket_details = local.enable_afm == true ? local.afm_cos_bucket_details : [] + afm_config_details = local.enable_afm == true ? local.afm_cos_config : [] + afm_cluster_instance_names = local.afm_instance_names + filesystem_mountpoint = var.scale_encryption_type == "key_protect" ? (var.storage_instances[*]["filesystem"] != "" ? var.storage_instances[*]["filesystem"] : jsonencode(var.filesystem_config[0]["filesystem"])) : jsonencode("") + depends_on = [time_sleep.wait_60_seconds] +} + +module "write_client_scale_cluster_inventory" { + count = var.scheduler == "Scale" && var.enable_deployer == false ? 1 : 0 + source = "./modules/write_scale_inventory" + json_inventory_path = format("%s/client_cluster_inventory.json", var.scale_ansible_repo_clone_path) + bastion_user = jsonencode(var.bastion_user) + bastion_instance_id = var.bastion_instance_id == null ? jsonencode("None") : jsonencode(var.bastion_instance_id) + bastion_instance_public_ip = var.bastion_fip == null ? 
jsonencode("None") : jsonencode(var.bastion_fip) + cloud_platform = jsonencode("") + resource_prefix = jsonencode("") + vpc_region = jsonencode("") + vpc_availability_zones = [] + scale_version = jsonencode("") + filesystem_block_size = jsonencode("") + compute_cluster_filesystem_mountpoint = jsonencode("None") + compute_cluster_instance_ids = [] + compute_cluster_instance_private_ips = [] + compute_cluster_instance_private_dns_ip_map = {} + storage_cluster_filesystem_mountpoint = local.scale_ces_enabled == true ? jsonencode(var.filesystem_config[0]["mount_point"]) : jsonencode("") + storage_cluster_instance_ids = [] + storage_cluster_instance_private_ips = [] + storage_cluster_with_data_volume_mapping = {} + storage_cluster_instance_private_dns_ip_map = {} + storage_cluster_desc_instance_ids = [] + storage_cluster_desc_instance_private_ips = [] + storage_cluster_desc_data_volume_mapping = {} + storage_cluster_desc_instance_private_dns_ip_map = {} + storage_cluster_instance_names = [] + compute_cluster_instance_names = [] + storage_subnet_cidr = jsonencode("") + compute_subnet_cidr = jsonencode("") + scale_remote_cluster_clustername = jsonencode("") + protocol_cluster_instance_names = [] + client_cluster_instance_names = local.scale_ces_enabled == true ? local.client_instance_names : [] + protocol_cluster_reserved_names = local.scale_ces_enabled == true ? format("%s-ces.%s", var.cluster_prefix, var.dns_domain_names["protocol"]) : "" + smb = false + nfs = false + object = false + interface = [] + export_ip_pool = [] + filesystem = jsonencode("") + mountpoint = jsonencode("") + protocol_gateway_ip = jsonencode("") + filesets = local.scale_ces_enabled == true ? local.fileset_size_map : {} + afm_cos_bucket_details = [] + afm_config_details = [] + afm_cluster_instance_names = [] + filesystem_mountpoint = jsonencode("") +} + +module "compute_cluster_configuration" { + count = var.scheduler == "Scale" && var.enable_deployer == false ? 1 : 0 + source = "./modules/common/compute_configuration" + turn_on = (var.create_separate_namespaces == true && local.static_compute_instance_count > 0) ? true : false + bastion_user = jsonencode(var.bastion_user) + write_inventory_complete = module.write_compute_scale_cluster_inventory[0].write_scale_inventory_complete + inventory_format = var.inventory_format + create_scale_cluster = var.create_scale_cluster + clone_path = var.scale_ansible_repo_clone_path + inventory_path = format("%s/compute_cluster_inventory.json", var.scale_ansible_repo_clone_path) + using_packer_image = var.using_packer_image + using_jumphost_connection = var.using_jumphost_connection + using_rest_initialization = var.using_rest_api_remote_mount + compute_cluster_gui_username = var.compute_gui_username + compute_cluster_gui_password = var.compute_gui_password + comp_memory = local.compute_memory + comp_vcpus_count = local.compute_vcpus_count + comp_bandwidth = local.compute_bandwidth + bastion_instance_public_ip = jsonencode(local.bastion_fip) + bastion_ssh_private_key = var.bastion_ssh_private_key + meta_private_key = module.landing_zone_vsi[0].compute_private_key_content + scale_version = local.scale_version + spectrumscale_rpms_path = var.spectrumscale_rpms_path + enable_mrot_conf = local.enable_mrot_conf + enable_ces = false + enable_afm = false + scale_encryption_enabled = var.scale_encryption_enabled + scale_encryption_admin_password = var.scale_encryption_admin_password + scale_encryption_servers = var.scale_encryption_enabled && var.scale_encryption_type == "gklm" ? 
local.gklm_instance_private_ips : [] + enable_ldap = var.enable_ldap + ldap_basedns = var.ldap_basedns + ldap_server = var.enable_ldap ? local.ldap_instance_private_ips[0] : null + ldap_admin_password = local.ldap_admin_password == "" ? jsonencode(null) : local.ldap_admin_password + enable_key_protect = var.scale_encryption_type + depends_on = [module.write_compute_scale_cluster_inventory] +} + +module "storage_cluster_configuration" { + count = var.scheduler == "Scale" && var.enable_deployer == false ? 1 : 0 + source = "./modules/common/storage_configuration" + turn_on = (var.create_separate_namespaces == true && local.storage_instance_count > 0) ? true : false + bastion_user = jsonencode(var.bastion_user) + write_inventory_complete = module.write_storage_scale_cluster_inventory[0].write_scale_inventory_complete + inventory_format = var.inventory_format + create_scale_cluster = var.create_scale_cluster + clone_path = var.scale_ansible_repo_clone_path + inventory_path = format("%s/storage_cluster_inventory.json", var.scale_ansible_repo_clone_path) + using_packer_image = var.using_packer_image + using_jumphost_connection = var.using_jumphost_connection + using_rest_initialization = true + storage_cluster_gui_username = var.storage_gui_username + storage_cluster_gui_password = var.storage_gui_password + colocate_protocol_instances = var.colocate_protocol_instances + is_colocate_protocol_subset = local.is_colocate_protocol_subset + mgmt_memory = local.management_memory + mgmt_vcpus_count = local.management_vcpus_count + mgmt_bandwidth = local.management_bandwidth + strg_desc_memory = local.storage_desc_memory + strg_desc_vcpus_count = local.storage_desc_vcpus_count + strg_desc_bandwidth = local.storage_desc_bandwidth + strg_memory = local.storage_memory + strg_vcpus_count = local.storage_vcpus_count + strg_bandwidth = local.storage_bandwidth + proto_memory = local.protocol_memory + proto_vcpus_count = local.protocol_vcpus_count + proto_bandwidth = local.protocol_bandwidth + strg_proto_memory = local.storage_protocol_memory + strg_proto_vcpus_count = local.storage_protocol_vcpus_count + strg_proto_bandwidth = local.storage_protocol_bandwidth + afm_memory = local.afm_memory + afm_vcpus_count = local.afm_vcpus_count + afm_bandwidth = local.afm_bandwidth + disk_type = "network-attached" + max_data_replicas = var.filesystem_config[0]["max_data_replica"] + max_metadata_replicas = var.filesystem_config[0]["max_metadata_replica"] + default_metadata_replicas = var.filesystem_config[0]["default_metadata_replica"] + default_data_replicas = var.filesystem_config[0]["default_data_replica"] + bastion_instance_public_ip = jsonencode(local.bastion_fip) + bastion_ssh_private_key = var.bastion_ssh_private_key + meta_private_key = module.landing_zone_vsi[0].storage_private_key_content + scale_version = local.scale_version + spectrumscale_rpms_path = var.spectrumscale_rpms_path + enable_mrot_conf = local.enable_mrot_conf + enable_ces = local.scale_ces_enabled + enable_afm = local.enable_afm + scale_encryption_enabled = var.scale_encryption_enabled + scale_encryption_type = var.scale_encryption_type != null ? var.scale_encryption_type : null + scale_encryption_admin_password = var.scale_encryption_admin_password + scale_encryption_servers = var.scale_encryption_enabled && var.scale_encryption_type == "gklm" ? local.gklm_instance_private_ips : [] + enable_ldap = var.enable_ldap + ldap_basedns = var.ldap_basedns + ldap_server = var.enable_ldap ? 
local.ldap_instance_private_ips[0] : null + ldap_admin_password = local.ldap_admin_password == "" ? jsonencode(null) : local.ldap_admin_password + ldap_server_cert = local.ldap_server_cert + enable_key_protect = var.scale_encryption_type + depends_on = [module.write_storage_scale_cluster_inventory] +} + +module "client_configuration" { + count = var.scheduler == "Scale" && var.enable_deployer == false ? 1 : 0 + source = "./modules/common//client_configuration" + turn_on = (local.client_instance_count > 0 && var.create_separate_namespaces == true && local.scale_ces_enabled == true) ? true : false + create_scale_cluster = var.create_scale_cluster + storage_cluster_create_complete = module.storage_cluster_configuration[0].storage_cluster_create_complete + clone_path = var.scale_ansible_repo_clone_path + using_jumphost_connection = var.using_jumphost_connection + client_inventory_path = format("%s/client_cluster_inventory.json", var.scale_ansible_repo_clone_path) + bastion_user = jsonencode(var.bastion_user) + bastion_instance_public_ip = jsonencode(local.bastion_fip) + bastion_ssh_private_key = var.bastion_ssh_private_key + client_meta_private_key = module.landing_zone_vsi[0].compute_private_key_content + write_inventory_complete = module.write_storage_scale_cluster_inventory[0].write_scale_inventory_complete + enable_ldap = var.enable_ldap + ldap_basedns = var.ldap_basedns + ldap_server = var.enable_ldap ? jsonencode(local.ldap_instance_private_ips[0]) : jsonencode(null) + ldap_admin_password = local.ldap_admin_password == "" ? jsonencode(null) : local.ldap_admin_password + depends_on = [module.compute_cluster_configuration, module.storage_cluster_configuration] +} + +module "remote_mount_configuration" { + count = var.scheduler == "Scale" && var.enable_deployer == false ? 1 : 0 + source = "./modules/common/remote_mount_configuration" + turn_on = (local.static_compute_instance_count > 0 && local.storage_instance_count > 0 && var.create_separate_namespaces == true) ? true : false + create_scale_cluster = var.create_scale_cluster + bastion_user = jsonencode(var.bastion_user) + clone_path = var.scale_ansible_repo_clone_path + compute_inventory_path = format("%s/compute_cluster_inventory.json", var.scale_ansible_repo_clone_path) + compute_gui_inventory_path = format("%s/compute_cluster_gui_details.json", var.scale_ansible_repo_clone_path) + storage_inventory_path = format("%s/storage_cluster_inventory.json", var.scale_ansible_repo_clone_path) + storage_gui_inventory_path = format("%s/storage_cluster_gui_details.json", var.scale_ansible_repo_clone_path) + compute_cluster_gui_username = var.compute_gui_username + compute_cluster_gui_password = var.compute_gui_password + storage_cluster_gui_username = var.storage_gui_username + storage_cluster_gui_password = var.storage_gui_password + using_jumphost_connection = var.using_jumphost_connection + using_rest_initialization = var.using_rest_api_remote_mount + bastion_instance_public_ip = jsonencode(local.bastion_fip) + bastion_ssh_private_key = var.bastion_ssh_private_key + compute_cluster_create_complete = var.enable_deployer ? false : module.compute_cluster_configuration[0].compute_cluster_create_complete + storage_cluster_create_complete = var.enable_deployer ? false : module.storage_cluster_configuration[0].storage_cluster_create_complete + depends_on = [module.compute_cluster_configuration, module.storage_cluster_configuration] +} + +module "compute_inventory" { + count = var.enable_deployer == false ? 
1 : 0 + source = "./modules/inventory" + scheduler = var.scheduler + hosts = local.compute_hosts + login_host = local.login_host + inventory_path = local.compute_inventory_path + name_mount_path_map = local.fileshare_name_mount_path_map + logs_enable_for_management = var.observability_logs_enable_for_management + monitoring_enable_for_management = var.observability_monitoring_enable + monitoring_enable_for_compute = var.observability_monitoring_on_compute_nodes_enable + cloud_monitoring_access_key = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation[0].cloud_monitoring_access_key : "" + cloud_monitoring_ingestion_url = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation[0].cloud_monitoring_ingestion_url : "" + cloud_monitoring_prws_key = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation[0].cloud_monitoring_prws_key : "" + cloud_monitoring_prws_url = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation[0].cloud_monitoring_prws_url : "" + logs_enable_for_compute = var.observability_logs_enable_for_compute + cloud_logs_ingress_private_endpoint = local.cloud_logs_ingress_private_endpoint + ha_shared_dir = local.ha_shared_dir + prefix = var.cluster_prefix + enable_ldap = var.enable_ldap + ldap_server = local.ldap_server != "null" ? local.ldap_server : join(",", local.ldap_hosts) + playbooks_path = local.playbooks_path + ldap_basedns = var.ldap_basedns + ldap_admin_password = local.ldap_admin_password + ldap_user_name = var.ldap_user_name + ldap_user_password = var.ldap_user_password + ldap_server_cert = local.ldap_server_cert + nfs_shares_map = local.nfs_shares_map + depends_on = [module.write_compute_cluster_inventory] +} + +module "ldap_inventory" { + count = var.enable_deployer == false && var.enable_ldap && local.ldap_server == "null" ? 1 : 0 + source = "./modules/inventory" + prefix = var.cluster_prefix + name_mount_path_map = local.fileshare_name_mount_path_map + enable_ldap = var.enable_ldap + ldap_server = local.ldap_server != "null" ? local.ldap_server : join(",", local.ldap_hosts) + playbooks_path = local.playbooks_path + ldap_basedns = var.ldap_basedns + ldap_admin_password = local.ldap_admin_password + ldap_user_name = var.ldap_user_name + ldap_user_password = var.ldap_user_password + ldap_server_cert = local.ldap_server_cert + depends_on = [module.write_compute_cluster_inventory] +} + +module "mgmt_inventory_hosts" { + count = var.enable_deployer == false ? 1 : 0 + source = "./modules/inventory_hosts" + hosts = local.mgmt_hosts_ips + inventory_path = local.mgmt_hosts_inventory_path +} + +module "compute_inventory_hosts" { + count = var.enable_deployer == false ? 1 : 0 + source = "./modules/inventory_hosts" + hosts = local.compute_hosts_ips + inventory_path = local.compute_hosts_inventory_path +} + +module "login_inventory_host" { + count = var.enable_deployer == false ? 1 : 0 + source = "./modules/inventory_hosts" + hosts = local.login_host_ip + inventory_path = local.login_host_inventory_path +} + +module "bastion_inventory_hosts" { + count = var.enable_deployer == true ? 1 : 0 + source = "./modules/inventory_hosts" + hosts = local.bastion_hosts_ips + inventory_path = local.bastion_hosts_inventory_path +} + +module "deployer_inventory_hosts" { + count = var.enable_deployer == true ? 
1 : 0 + source = "./modules/inventory_hosts" + hosts = local.deployer_hosts_ips + inventory_path = local.deployer_hosts_inventory_path +} + +module "ldap_inventory_hosts" { + count = var.enable_deployer == false && var.enable_ldap == true ? 1 : 0 + source = "./modules/inventory_hosts" + hosts = local.ldap_hosts + inventory_path = local.ldap_hosts_inventory_path +} + +module "compute_playbook" { + count = var.enable_deployer == false ? 1 : 0 + source = "./modules/playbook" + scheduler = var.scheduler + bastion_fip = local.bastion_fip + private_key_path = local.compute_private_key_path + inventory_path = local.compute_inventory_path + enable_deployer = var.enable_deployer + ibmcloud_api_key = var.ibmcloud_api_key + observability_provision = var.observability_logs_enable_for_management || var.observability_logs_enable_for_compute || var.observability_monitoring_enable ? true : false + cloudlogs_provision = var.observability_logs_enable_for_management || var.observability_logs_enable_for_compute ? true : false + observability_playbook_path = local.observability_playbook_path + lsf_mgmt_playbooks_path = local.lsf_mgmt_playbooks_path + enable_ldap = var.enable_ldap + ldap_server = local.ldap_server + playbooks_path = local.playbooks_path + mgmnt_hosts = local.mgmnt_host_entry + comp_hosts = local.comp_host_entry + login_host = local.login_host_entry + deployer_host = local.deployer_host_entry + domain_name = var.dns_domain_names["compute"] + enable_dedicated_host = var.enable_dedicated_host + depends_on = [module.compute_inventory, module.landing_zone_vsi] +} + +################################################### +# Observability Modules +################################################### + +module "cloud_monitoring_instance_creation" { + count = var.enable_deployer == false ? 1 : 0 + source = "./modules/observability_instance" + location = local.region + rg = var.resource_group_ids["service_rg"] + cloud_monitoring_provision = var.observability_monitoring_enable + observability_monitoring_plan = var.observability_monitoring_plan + enable_metrics_routing = var.observability_enable_metrics_routing + enable_platform_logs = var.observability_enable_platform_logs + cluster_prefix = var.cluster_prefix + cloud_monitoring_instance_name = "${var.cluster_prefix}-metrics" + cloud_logs_provision = var.observability_logs_enable_for_management || var.observability_logs_enable_for_compute ? true : false + cloud_logs_instance_name = "${var.cluster_prefix}-cloud-logs" + cloud_logs_retention_period = var.observability_logs_retention_period + cloud_logs_as_atracker_target = var.observability_atracker_enable && (var.observability_atracker_target_type == "cloudlogs") ? true : false + cloud_logs_data_bucket = var.cloud_logs_data_bucket + cloud_metrics_data_bucket = var.cloud_metrics_data_bucket + tags = ["lsf", var.cluster_prefix] +} + +module "scc_workload_protection" { + source = "./modules/security/sccwp" + resource_group_name = var.existing_resource_group != "null" ? 
var.existing_resource_group : "${var.cluster_prefix}-service-rg" + prefix = var.cluster_prefix + region = local.region + sccwp_service_plan = var.sccwp_service_plan + resource_tags = ["lsf", var.cluster_prefix] + enable_deployer = var.enable_deployer + sccwp_enable = var.sccwp_enable + cspm_enabled = var.cspm_enabled + app_config_plan = var.app_config_plan + scc_workload_protection_trusted_profile_name = "${var.cluster_prefix}-wp-tp" +} diff --git a/modules/alb/locals.tf b/modules/alb/locals.tf deleted file mode 100644 index 3088b553..00000000 --- a/modules/alb/locals.tf +++ /dev/null @@ -1,3 +0,0 @@ -locals { - pool_ids = { for idx, pool in ibm_is_lb_pool.alb_backend_pools : pool.name => pool.id } -} diff --git a/modules/alb/main.tf b/modules/alb/main.tf deleted file mode 100644 index d230dc31..00000000 --- a/modules/alb/main.tf +++ /dev/null @@ -1,57 +0,0 @@ -resource "ibm_is_lb" "alb" { - count = var.create_load_balancer ? 1 : 0 - name = format("%s-alb", var.prefix) - resource_group = var.resource_group_id - type = var.alb_type - security_groups = var.security_group_ids - subnets = [var.bastion_subnets[0].id] -} - -resource "ibm_is_lb_pool" "alb_backend_pools" { - count = var.create_load_balancer ? length(var.alb_pools) : 0 - name = format(var.alb_pools[count.index]["name"], var.prefix) - lb = ibm_is_lb.alb[0].id - algorithm = var.alb_pools[count.index]["algorithm"] - protocol = var.alb_pools[count.index]["protocol"] - health_delay = var.alb_pools[count.index]["health_delay"] - health_retries = var.alb_pools[count.index]["health_retries"] - health_timeout = var.alb_pools[count.index]["health_timeout"] - health_type = var.alb_pools[count.index]["health_type"] - health_monitor_url = var.alb_pools[count.index]["health_monitor_url"] - health_monitor_port = var.alb_pools[count.index]["health_monitor_port"] - session_persistence_type = var.alb_pools[count.index]["session_persistence_type"] -} - -resource "ibm_is_lb_listener" "alb_frontend_listener" { - count = var.create_load_balancer ? length(var.alb_pools) : 0 - lb = ibm_is_lb.alb[0].id - port = var.alb_pools[count.index]["lb_pool_listener"]["port"] - protocol = var.alb_pools[count.index]["lb_pool_listener"]["protocol"] - idle_connection_timeout = var.alb_pools[count.index]["lb_pool_listener"]["idle_connection_timeout"] - certificate_instance = var.certificate_instance - default_pool = lookup(local.pool_ids, format(var.alb_pools[count.index]["name"], var.prefix), null) -} - -resource "ibm_is_lb_pool_member" "alb_candidate_members_8443" { - count = var.create_load_balancer ? length(var.vsi_ids) : 0 - lb = ibm_is_lb.alb[0].id - pool = element(split("/", lookup(local.pool_ids, format(var.alb_pools[0]["name"], var.prefix), null)), 1) - port = var.alb_pools[0]["lb_pool_members_port"] - target_id = var.vsi_ids[count.index]["id"] -} - -resource "ibm_is_lb_pool_member" "alb_candidate_members_8444" { - count = var.create_load_balancer ? 1 : 0 - lb = ibm_is_lb.alb[0].id - pool = element(split("/", lookup(local.pool_ids, format(var.alb_pools[1]["name"], var.prefix), null)), 1) - port = var.alb_pools[1]["lb_pool_members_port"] - target_id = var.vsi_ids[0]["id"] -} - -resource "ibm_is_lb_pool_member" "alb_candidate_members_6080" { - count = var.create_load_balancer ? 
length(var.vsi_ids) : 0 - lb = ibm_is_lb.alb[0].id - pool = element(split("/", lookup(local.pool_ids, format(var.alb_pools[2]["name"], var.prefix), null)), 1) - port = var.alb_pools[2]["lb_pool_members_port"] - target_id = var.vsi_ids[count.index]["id"] -} diff --git a/modules/alb/outputs.tf b/modules/alb/outputs.tf deleted file mode 100644 index aa22195f..00000000 --- a/modules/alb/outputs.tf +++ /dev/null @@ -1,5 +0,0 @@ - -output "alb_hostname" { - description = "ALB hostname" - value = var.create_load_balancer ? ibm_is_lb.alb[0].hostname : "" -} diff --git a/modules/alb/variables.tf b/modules/alb/variables.tf deleted file mode 100644 index b225d06d..00000000 --- a/modules/alb/variables.tf +++ /dev/null @@ -1,136 +0,0 @@ - -variable "resource_group_id" { - description = "String describing resource groups to create or reference" - type = string - default = null -} - -variable "prefix" { - description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." - type = string - - validation { - error_message = "Prefix must begin and end with a letter and contain only letters, numbers, and - characters." - condition = can(regex("^([A-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.prefix)) - } -} - -variable "certificate_instance" { - description = "Certificate instance CRN value. It's the CRN value of a certificate stored in the Secret Manager" - type = string - default = "" -} - -variable "security_group_ids" { - type = list(string) - description = "List of Security group IDs to allow File share access" - default = null -} - -variable "bastion_subnets" { - type = list(object({ - name = string - id = string - zone = string - cidr = string - })) - default = [] - description = "Subnets to launch the bastion host." -} - -variable "create_load_balancer" { - description = "True to create new Load Balancer." 
- type = bool -} - -variable "vsi_ids" { - type = list( - object({ - id = string, - }) - ) - description = "VSI data" -} - -variable "alb_type" { - description = "ALB type" - type = string - default = "private" -} - -variable "alb_pools" { - description = "List of Load Balancer Pools" - type = list(object({ - name = string - algorithm = string - protocol = string - health_delay = number - health_retries = number - health_timeout = number - health_type = string - health_monitor_url = string - health_monitor_port = number - session_persistence_type = string - lb_pool_members_port = number - lb_pool_listener = object({ - port = number - protocol = string - idle_connection_timeout = number - }) - })) - default = [ - { - name = "%s-alb-pool-8443" - algorithm = "round_robin" - protocol = "https" - health_delay = 5 - health_retries = 5 - health_timeout = 2 - health_type = "https" - health_monitor_url = "/platform/" - health_monitor_port = 8443 - session_persistence_type = "http_cookie" - lb_pool_members_port = 8443 - lb_pool_listener = { - port = 8443 - protocol = "https" - idle_connection_timeout = 50 - } - }, - { - name = "%s-alb-pool-8444" - algorithm = "round_robin" - protocol = "https" - health_delay = 5 - health_retries = 5 - health_timeout = 2 - health_type = "https" - health_monitor_url = "/" - health_monitor_port = 8444 - session_persistence_type = "http_cookie" - lb_pool_members_port = 8444 - lb_pool_listener = { - port = 8444 - protocol = "https" - idle_connection_timeout = 7200 - } - }, - { - name = "%s-alb-pool-6080" - algorithm = "round_robin" - protocol = "https" - health_delay = 5 - health_retries = 5 - health_timeout = 2 - health_type = "https" - health_monitor_url = "/" - health_monitor_port = 6080 - session_persistence_type = "http_cookie" - lb_pool_members_port = 6080 - lb_pool_listener = { - port = 6080 - protocol = "https" - idle_connection_timeout = 50 - } - }] -} diff --git a/modules/alb_api/.gitignore b/modules/alb_api/.gitignore deleted file mode 100644 index 06eab7ae..00000000 --- a/modules/alb_api/.gitignore +++ /dev/null @@ -1 +0,0 @@ -debug_*.txt diff --git a/modules/alb_api/locals.tf b/modules/alb_api/locals.tf deleted file mode 100644 index 54a975c2..00000000 --- a/modules/alb_api/locals.tf +++ /dev/null @@ -1,2 +0,0 @@ -locals { -} diff --git a/modules/alb_api/main.tf b/modules/alb_api/main.tf deleted file mode 100644 index b4041bd2..00000000 --- a/modules/alb_api/main.tf +++ /dev/null @@ -1,36 +0,0 @@ -provider "shell" { - environment = { - } - interpreter = ["/bin/bash", "-c"] - enable_parallelism = false -} - -resource "shell_script" "alb_api" { - count = var.create_load_balancer ? 1 : 0 - lifecycle_commands { - create = "scripts/alb-create.sh" - # read = "scripts/alb-read.sh" - # update = "scripts/alb-update.sh" - delete = "scripts/alb-delete.sh" - } - working_directory = path.module - # interpreter = ["/bin/bash", "-c"] - sensitive_environment = { - ibmcloud_api_key = var.ibmcloud_api_key - } - environment = { - region = var.region - resource_group_id = var.resource_group_id - prefix = var.prefix - bastion_subnet_id = var.bastion_subnets[0].id - certificate_instance = var.certificate_instance - firstip = var.vsi_ips[0] - pool_ips = join(",", var.vsi_ips[*]) - security_group_ids = join(",", var.security_group_ids[*]) - } - triggers = { - # We actually always do delete/create, since "update" is not implemented. - # when_value_changed = var.region - # ... 
- } -} diff --git a/modules/alb_api/outputs.tf b/modules/alb_api/outputs.tf deleted file mode 100644 index 0d294c09..00000000 --- a/modules/alb_api/outputs.tf +++ /dev/null @@ -1,4 +0,0 @@ -output "alb_hostname" { - description = "ALB hostname" - value = var.create_load_balancer ? shell_script.alb_api[0].output["hostname"] : "" -} diff --git a/modules/alb_api/scripts/alb-create.sh b/modules/alb_api/scripts/alb-create.sh deleted file mode 100755 index fe9a946b..00000000 --- a/modules/alb_api/scripts/alb-create.sh +++ /dev/null @@ -1,261 +0,0 @@ -#!/bin/bash -# shellcheck disable=all - -# inputs we assume to get: -# - bastion_subnet_id -# - certificate_instance -# - pool_ips (comma separated) -# - prefix -# - resource_group_id -# - security_group_ids (comma separated) - -debug=true # "true" or "false" - -exec 111>&1 >&2 # use fd 111 later to emit json, other output goes to stderr - -$debug && echo "CREATE $(date +%Y%m%dT%H%M%S.%N)" >>debug_shell_log.txt - -$debug && sort -z /proc/self/environ|tr \\0 \\n >debug_shell_create_env.txt - -# json input from stdin; nothing really expected, input values come from env variables when creating -in="$(cat)" -$debug && echo >debug_shell_create_in.txt "in=<<<$in>>>" - - -# Going to build the complex json request for the ALB creation. -# Many pieces have to be customized, duplicated and finally merged together. - -# pieces -jreq="$(cat <>>$jreq<<<" - -# rough sanity check -if [ "${#jreq}" -lt 100 ]; then - echo "failed to create the JSON request" - exit 1 -fi - -# Step 1. Get a IAM token. - -out="$(curl -X POST 'https://iam.cloud.ibm.com/identity/token' \ - -H 'Content-Type: application/x-www-form-urlencoded' \ - -d "grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey=${ibmcloud_api_key}")" -$debug && echo "$out" -iam_token="$(jq -r '.access_token' <<<"$out")" -$debug && echo "$iam_token" - -# rough sanity check -if [ "${#iam_token}" -lt 100 ]; then - echo "failed to get a IAM token" - exit 1 -fi - -# Step 2. Create the LB. - -out="$(curl -X POST "https://${region}.iaas.cloud.ibm.com/v1/load_balancers?version=2024-04-25&generation=2" \ - -H "Authorization: Bearer $iam_token" \ - -H 'Content-Type: application/json' \ - -H 'accept: application/json' \ - -d "$jreq")" -$debug && echo "$out" -lbid="$(jq -r '.id' <<<"$out")" -$debug && echo "$lbid" - -# rough sanity check -if [ "${#lbid}" -lt 10 ]; then - echo "failed to get a LB id" - exit 1 -fi - -# Other interesting outputs can be collected. - -name="$(jq -r '.name' <<<"$out")" -hostname="$(jq -r '.hostname' <<<"$out")" -crn="$(jq -r '.crn' <<<"$out")" -href="$(jq -r '.href' <<<"$out")" - - -# Step 3. Finally wait for the LB to be really running. 
- -max_wait_seconds=$((20*60)) -start_at="$(date +%s)" -while true; do - now="$(date +%s)" - if [ "$now" -gt "$((start_at+max_wait_seconds))" ]; then - echo "timeout waiting for LB creation" - exit 1 - fi - - out="$(curl -X GET "https://${region}.iaas.cloud.ibm.com/v1/load_balancers/$lbid?version=2024-04-25&generation=2" \ - -H "Authorization: Bearer $iam_token")" - status="$(jq -r '.provisioning_status' <<<"$out")" - error="$(jq -r '.errors[].code' <<<"$out" )" - $debug && echo "$(date -Is) $status" - $debug && echo "$(date -Is) $error" - - if [ "$status" == "active" ]; then - echo "LB successfully created" - break - elif [ "$status" == "create_pending" ]; then - delay=5 - else # this also handles connection problems - delay=4 - fi - - echo "waiting $delay seconds" - sleep $delay -done -# Note possibile status we can get: -# - create_pending -# - active -# - delete_pending -# Or a specific error if LB is not existent (.errors[].code) -# - load_balancer_not_found - -# All done, prepare final output including interesting values to consume. - -res="$(cat <&111 "$res" - -exit 0 diff --git a/modules/alb_api/scripts/alb-delete.sh b/modules/alb_api/scripts/alb-delete.sh deleted file mode 100755 index b6faabc9..00000000 --- a/modules/alb_api/scripts/alb-delete.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash -# shellcheck disable=all - -debug=true # "true" or "false" - -exec 111>&1 >&2 # use fd 111 later to emit json, other output goes to stderr - -$debug && echo "DELETE $(date +%Y%m%dT%H%M%S.%N)" >>debug_shell_log.txt - -$debug && sort -z /proc/self/environ|tr \\0 \\n >debug_shell_delete_env.txt - -# json input from stdin; we get the "id" of the LB here -in=$(cat) -$debug && echo >debug_shell_delete_in.txt "in=<<<$in>>>" - -lbid="$(jq -r .id <<<"$in")" - -# rough sanity check -if [ "${#lbid}" -lt 10 ]; then - echo "failed to get a LB id" - exit 1 -fi - -# Step 1. Get a IAM token. - -out="$(curl -X POST 'https://iam.cloud.ibm.com/identity/token' \ - -H 'Content-Type: application/x-www-form-urlencoded' \ - -d "grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey=${ibmcloud_api_key}")" -$debug && echo "$out" -iam_token="$(jq -r '.access_token' <<<"$out")" -$debug && echo "$iam_token" - -# rough sanity check -if [ "${#iam_token}" -lt 100 ]; then - echo "failed to get a IAM token" - exit 1 -fi - -# Step 2. Delete the LB. - -out="$(curl -X DELETE "https://${region}.iaas.cloud.ibm.com/v1/load_balancers/$lbid?version=2024-04-25&generation=2" \ - -H "Authorization: Bearer $iam_token")" -$debug && echo "$out" - -# Step 3. Finally wait for the LB to really disappear. 
- -max_wait_seconds=$((15*60)) -start_at="$(date +%s)" -while true; do - now="$(date +%s)" - if [ "$now" -gt "$((start_at+max_wait_seconds))" ]; then - echo "timeout waiting for LB deletion" - exit 1 - fi - - out="$(curl -X GET "https://${region}.iaas.cloud.ibm.com/v1/load_balancers/$lbid?version=2024-04-25&generation=2" \ - -H "Authorization: Bearer $iam_token")" - status="$(jq -r '.provisioning_status' <<<"$out")" - error="$(jq -r '.errors[].code' <<<"$out" )" - $debug && echo "$(date -Is) $status" - $debug && echo "$(date -Is) $error" - - if [ "$error" == "load_balancer_not_found" ]; then - echo "LB successfully deleted" - break - elif [ "$status" == "delete_pending" ]; then - delay=5 - else # this also handles connection problems - delay=4 - fi - - echo "waiting $delay seconds" - sleep $delay -done -# Note possibile status we can get: -# - create_pending -# - active -# - delete_pending -# Or a specific error if LB is not existent (.errors[].code) -# - load_balancer_not_found - -# All done, no output has to be generated. - -exit 0 diff --git a/modules/alb_api/scripts/alb-read.sh b/modules/alb_api/scripts/alb-read.sh deleted file mode 100755 index e087922a..00000000 --- a/modules/alb_api/scripts/alb-read.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -# Optional, implementation of "resource state sync" functionality. -# We do not need this. diff --git a/modules/alb_api/scripts/alb-update.sh b/modules/alb_api/scripts/alb-update.sh deleted file mode 100755 index a348884a..00000000 --- a/modules/alb_api/scripts/alb-update.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -# Optional, implementation of "resource change" functionality. -# We do not need this, the delete+create fallback is ok. diff --git a/modules/alb_api/variables.tf b/modules/alb_api/variables.tf deleted file mode 100644 index 7c785181..00000000 --- a/modules/alb_api/variables.tf +++ /dev/null @@ -1,60 +0,0 @@ -variable "ibmcloud_api_key" { - description = "IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required." - type = string - sensitive = true - default = null -} - -variable "region" { - description = "The region where the ALB must be instantiated" - type = string -} - -variable "resource_group_id" { - description = "String describing resource groups to create or reference" - type = string - default = null -} - -variable "prefix" { - description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." - type = string - - validation { - error_message = "Prefix must begin and end with a letter and contain only letters, numbers, and - characters." - condition = can(regex("^([A-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.prefix)) - } -} - -variable "certificate_instance" { - description = "Certificate instance CRN value. It's the CRN value of a certificate stored in the Secret Manager" - type = string - default = "" -} - -variable "security_group_ids" { - type = list(string) - description = "List of Security group IDs to allow File share access" - default = null -} - -variable "bastion_subnets" { - type = list(object({ - name = string - id = string - zone = string - cidr = string - })) - default = [] - description = "Subnets to launch the bastion host." -} - -variable "create_load_balancer" { - description = "True to create new Load Balancer." 
- type = bool -} - -variable "vsi_ips" { - type = list(string) - description = "VSI IPv4 addresses" -} diff --git a/modules/alb_api/version.tf b/modules/alb_api/version.tf deleted file mode 100644 index bc7ab64d..00000000 --- a/modules/alb_api/version.tf +++ /dev/null @@ -1,9 +0,0 @@ -terraform { - required_version = ">= 1.9.0" - required_providers { - shell = { - source = "scottwinkler/shell" - version = "1.7.10" - } - } -} diff --git a/modules/ansible-roles/roles/cloudlogs/tasks/compute-cloudlogs-configure.yml b/modules/ansible-roles/roles/cloudlogs/tasks/compute-cloudlogs-configure.yml new file mode 100644 index 00000000..3c3ed4f5 --- /dev/null +++ b/modules/ansible-roles/roles/cloudlogs/tasks/compute-cloudlogs-configure.yml @@ -0,0 +1,145 @@ +--- +# Installation Section +- name: Print start message + ansible.builtin.debug: + msg: "Cloud logs agent installation started" + +- name: Download Cloud Logs Agent RPM package checksum + ansible.builtin.get_url: + url: "https://logs-router-agent-install-packages.s3.us.cloud-object-storage.appdomain.cloud/logs-router-agent-rhel8-1.3.1.rpm.sha256" + dest: "/tmp/logs-router-agent-rhel8-1.3.1.rpm.sha256" + +- name: Download Cloud Logs Agent RPM package + ansible.builtin.get_url: + url: "https://logs-router-agent-install-packages.s3.us.cloud-object-storage.appdomain.cloud/logs-router-agent-rhel8-1.3.1.rpm" + dest: "/tmp/logs-router-agent-rhel8-1.3.1.rpm" + +- name: Verify package integrity + ansible.builtin.command: sha256sum -c /tmp/logs-router-agent-rhel8-1.3.1.rpm.sha256 + args: + chdir: /tmp + register: checksum_result + changed_when: false + +- name: Debug checksum result + ansible.builtin.debug: + msg: "{{ checksum_result.stdout_lines }}" + +- name: Install Cloud Logs Agent + ansible.builtin.yum: + name: /tmp/logs-router-agent-rhel8-1.3.1.rpm + state: present + disable_gpg_check: yes + +- name: Verify installation + ansible.builtin.shell: rpm -qa | grep logs-router-agent + register: agent_installed + changed_when: false + +- name: Debug installed package + ansible.builtin.debug: + msg: "{{ agent_installed.stdout }}" + +- name: Download post-config.sh script + ansible.builtin.get_url: + url: "https://logs-router-agent-config.s3.us.cloud-object-storage.appdomain.cloud/post-config.sh" + dest: "/root/post-config.sh" + mode: "0755" + +- name: Print installation completion message + ansible.builtin.debug: + msg: "Cloud logs agent installed successfully" + +# Observability Logs Setup +- name: Check if observability logs for compute are enabled + ansible.builtin.debug: + msg: "Configuring cloud logs for compute since observability logs for compute is enabled" + when: logs_enable_for_compute | bool + +- name: Copy post-config.sh script + ansible.builtin.copy: + src: /root/post-config.sh + dest: /opt/ibm/post-config.sh + mode: '0755' + remote_src: true + when: logs_enable_for_compute | bool + +- name: Create fluent-bit.conf for cloud logs + ansible.builtin.copy: + dest: /etc/fluent-bit/fluent-bit.conf + content: | + [SERVICE] + Flush 1 + Log_Level info + Daemon off + Parsers_File parsers.conf + Plugins_File plugins.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + HTTP_Port 9001 + Health_Check On + HC_Errors_Count 1 + HC_Retry_Failure_Count 1 + HC_Period 30 + storage.path /fluent-bit/cache + storage.max_chunks_up 192 + storage.metrics On + [INPUT] + Name syslog + Path /tmp/in_syslog + Buffer_Chunk_Size 32000 + Buffer_Max_Size 64000 + Receive_Buffer_Size 512000 + [INPUT] + Name tail + Tag * + Path /opt/ibm/lsflogs/*.log.* + Path_Key file + Exclude_Path 
/var/log/at/** + DB /opt/ibm/lsflogs/fluent-bit.DB + Buffer_Chunk_Size 32KB + Buffer_Max_Size 256KB + Skip_Long_Lines On + Refresh_Interval 10 + storage.type filesystem + storage.pause_on_chunks_overlimit on + [FILTER] + Name modify + Match * + Add subsystemName compute + Add applicationName lsf + @INCLUDE output-logs-router-agent.conf + when: logs_enable_for_compute | bool + +- name: Fetch API KEY from file + ansible.builtin.slurp: + src: /opt/ibm/temp_file.txt + register: api_key_file + delegate_to: localhost + +- name: Decode API Key + set_fact: + env_api_key: "{{ api_key_file['content'] | b64decode | trim }}" #pragma: allowlist secret + +- name: Delete API Key file + ansible.builtin.file: + path: /opt/ibm/temp_file.txt + state: absent + delegate_to: localhost + +- name: Run post-config.sh script + ansible.builtin.command: > + /opt/ibm/post-config.sh -h {{ cloud_logs_ingress_private_endpoint }} + -p "3443" + -t "/logs/v1/singles" + -a IAMAPIKey + -k {{ env_api_key }} + --send-directly-to-icl + -s true + -i Production + when: logs_enable_for_compute | bool + +- name: Test cloud logs configuration + ansible.builtin.shell: | + echo "INFO Testing IBM Cloud LSF Logs from compute: {{ ansible_hostname }}" >> /opt/ibm/lsflogs/test.log.com + when: logs_enable_for_compute | bool diff --git a/modules/ansible-roles/roles/cloudlogs/tasks/main.yml b/modules/ansible-roles/roles/cloudlogs/tasks/main.yml new file mode 100644 index 00000000..771144eb --- /dev/null +++ b/modules/ansible-roles/roles/cloudlogs/tasks/main.yml @@ -0,0 +1,13 @@ +--- + +# Tasks for Cloud Logs Configuration (for management nodes) +- import_tasks: roles/cloudlogs/tasks/mgmt-cloudlogs-configure.yml + when: + - inventory_hostname in groups['management_nodes'] + - logs_enable_for_management | bool + +# Tasks for Cloud Logs Configuration (for compute nodes) +- import_tasks: roles/cloudlogs/tasks/compute-cloudlogs-configure.yml + when: + - inventory_hostname in groups['compute_nodes'] + - logs_enable_for_compute | bool diff --git a/modules/ansible-roles/roles/cloudlogs/tasks/mgmt-cloudlogs-configure.yml b/modules/ansible-roles/roles/cloudlogs/tasks/mgmt-cloudlogs-configure.yml new file mode 100644 index 00000000..b7e6fe4a --- /dev/null +++ b/modules/ansible-roles/roles/cloudlogs/tasks/mgmt-cloudlogs-configure.yml @@ -0,0 +1,148 @@ +--- +# Installation Section +- name: Print start message + ansible.builtin.debug: + msg: "Cloud logs agent installation started" + +- name: Download Cloud Logs Agent RPM package checksum + ansible.builtin.get_url: + url: "https://logs-router-agent-install-packages.s3.us.cloud-object-storage.appdomain.cloud/logs-router-agent-rhel8-1.3.1.rpm.sha256" + dest: "/tmp/logs-router-agent-rhel8-1.3.1.rpm.sha256" + +- name: Download Cloud Logs Agent RPM package + ansible.builtin.get_url: + url: "https://logs-router-agent-install-packages.s3.us.cloud-object-storage.appdomain.cloud/logs-router-agent-rhel8-1.3.1.rpm" + dest: "/tmp/logs-router-agent-rhel8-1.3.1.rpm" + +- name: Verify package integrity + ansible.builtin.command: sha256sum -c /tmp/logs-router-agent-rhel8-1.3.1.rpm.sha256 + args: + chdir: /tmp + register: checksum_result + changed_when: false + +- name: Debug checksum result + ansible.builtin.debug: + msg: "{{ checksum_result.stdout_lines }}" + +- name: Install Cloud Logs Agent + ansible.builtin.yum: + name: /tmp/logs-router-agent-rhel8-1.3.1.rpm + state: present + disable_gpg_check: yes + +- name: Verify installation + ansible.builtin.shell: rpm -qa | grep logs-router-agent + register: agent_installed + 
changed_when: false + +- name: Debug installed package + ansible.builtin.debug: + msg: "{{ agent_installed.stdout }}" + +- name: Download post-config.sh script + ansible.builtin.get_url: + url: "https://logs-router-agent-config.s3.us.cloud-object-storage.appdomain.cloud/post-config.sh" + dest: "/root/post-config.sh" + mode: "0755" + +- name: Print installation completion message + ansible.builtin.debug: + msg: "Cloud logs agent installed successfully" + +# Observability Logs Setup +- name: Check if observability logs for management are enabled + ansible.builtin.debug: + msg: "Configuring cloud logs for management since observability logs for management is enabled" + when: logs_enable_for_management | bool + +- name: Copy post-config.sh script + ansible.builtin.copy: + src: /root/post-config.sh + dest: /opt/ibm/post-config.sh + mode: '0755' + remote_src: true + when: logs_enable_for_management | bool + +- name: Create fluent-bit.conf for cloud logs + ansible.builtin.copy: + dest: /etc/fluent-bit/fluent-bit.conf + content: | + [SERVICE] + Flush 1 + Log_Level info + Daemon off + Parsers_File parsers.conf + Plugins_File plugins.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + HTTP_Port 9001 + Health_Check On + HC_Errors_Count 1 + HC_Retry_Failure_Count 1 + HC_Period 30 + storage.path /fluent-bit/cache + storage.max_chunks_up 192 + storage.metrics On + [INPUT] + Name syslog + Path /tmp/in_syslog + Buffer_Chunk_Size 32000 + Buffer_Max_Size 64000 + Receive_Buffer_Size 512000 + [INPUT] + Name tail + Tag * + Path /opt/ibm/lsflogs/*.log.* + Path_Key file + Exclude_Path /var/log/at/** + DB /opt/ibm/lsflogs/fluent-bit.DB + Buffer_Chunk_Size 32KB + Buffer_Max_Size 256KB + Skip_Long_Lines On + Refresh_Interval 10 + storage.type filesystem + storage.pause_on_chunks_overlimit on + [FILTER] + Name modify + Match * + Add subsystemName management + Add applicationName lsf + @INCLUDE output-logs-router-agent.conf + when: logs_enable_for_management | bool + +- name: Fetch API KEY from file + ansible.builtin.slurp: + src: /opt/ibm/temp_file.txt + register: api_key_file + delegate_to: localhost + +- name: Decode API Key + set_fact: + env_api_key: "{{ api_key_file['content'] | b64decode | trim }}" #pragma: allowlist secret + +- name: Delete API Key file + ansible.builtin.file: + path: /opt/ibm/temp_file.txt + state: absent + delegate_to: localhost + when: + - logs_enable_for_management | bool + - not logs_enable_for_compute | bool + +- name: Run post-config.sh script + ansible.builtin.command: > + /opt/ibm/post-config.sh -h {{ cloud_logs_ingress_private_endpoint }} + -p "3443" + -t "/logs/v1/singles" + -a IAMAPIKey + -k {{ env_api_key }} + --send-directly-to-icl + -s true + -i Production + when: logs_enable_for_management | bool + +- name: Test cloud logs configuration + ansible.builtin.shell: | + echo "INFO Testing IBM Cloud LSF Logs from management: {{ ansible_hostname }}" >> /opt/ibm/lsflogs/test.log.com + when: logs_enable_for_management | bool diff --git a/modules/my_ip/locals.tf b/modules/ansible-roles/roles/cloudlogs/vars/main.yml similarity index 100% rename from modules/my_ip/locals.tf rename to modules/ansible-roles/roles/cloudlogs/vars/main.yml diff --git a/modules/ansible-roles/roles/cloudmonitoring/tasks/compute-cloudmonitoring-configure.yml b/modules/ansible-roles/roles/cloudmonitoring/tasks/compute-cloudmonitoring-configure.yml new file mode 100644 index 00000000..c28dcc39 --- /dev/null +++ b/modules/ansible-roles/roles/cloudmonitoring/tasks/compute-cloudmonitoring-configure.yml @@ -0,0 +1,48 @@ +- 
name: Check if monitoring is enabled + ansible.builtin.debug: + msg: "Cloud Monitoring is enabled for compute" + when: monitoring_enable_for_compute | bool + +- name: Check if Sysdig Agent binary exists + ansible.builtin.stat: + path: /opt/draios/bin/dragent + register: sysdig_binary + when: monitoring_enable_for_compute | bool + +- name: Install Sysdig Agent if binary is missing + ansible.builtin.shell: | + echo "Installing Sysdig Agent..." + curl -sL https://ibm.biz/install-sysdig-agent | sudo bash -s -- \ + --access_key {{ cloud_monitoring_access_key }} \ + --collector {{ cloud_monitoring_ingestion_url }} \ + --collector_port 6443 \ + --secure true \ + --check_certificate false \ + --additional_conf 'sysdig_capture_enabled: false\nremotefs: true\nfeature:\n mode: monitor_light' + when: + - monitoring_enable_for_compute | bool + - not sysdig_binary.stat.exists + +- name: Configure Sysdig + ansible.builtin.lineinfile: + path: "{{ sysdig_config_file }}" + regexp: "{{ item.regexp }}" + line: "{{ item.line }}" + create: yes + loop: + - { regexp: "==ACCESSKEY==", line: "customerid: {{ cloud_monitoring_access_key }}" } + - { regexp: "==COLLECTOR==", line: "collector: {{ cloud_monitoring_ingestion_url }}" } + - { regexp: "^tags:", line: "tags: type:compute,lsf:true" } + when: monitoring_enable_for_compute | bool + +- name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: yes + when: monitoring_enable_for_compute | bool + +- name: Ensure Sysdig Agent is enabled and running + ansible.builtin.systemd: + name: dragent + enabled: yes + state: started + when: monitoring_enable_for_compute | bool diff --git a/modules/ansible-roles/roles/cloudmonitoring/tasks/main.yml b/modules/ansible-roles/roles/cloudmonitoring/tasks/main.yml new file mode 100644 index 00000000..1ad4bfa3 --- /dev/null +++ b/modules/ansible-roles/roles/cloudmonitoring/tasks/main.yml @@ -0,0 +1,13 @@ +--- + +# Tasks for Cloud Monitoring Configuration (for management nodes) +- import_tasks: mgmt-cloudmonitoring-configure.yml + when: + - inventory_hostname in groups['management_nodes'] + - monitoring_enable_for_management | bool + +# Tasks for Cloud Monitoring Configuration (for compute nodes) +- import_tasks: compute-cloudmonitoring-configure.yml + when: + - inventory_hostname in groups['compute_nodes'] + - monitoring_enable_for_compute | bool diff --git a/modules/ansible-roles/roles/cloudmonitoring/tasks/mgmt-cloudmonitoring-configure.yml b/modules/ansible-roles/roles/cloudmonitoring/tasks/mgmt-cloudmonitoring-configure.yml new file mode 100644 index 00000000..48f7b26b --- /dev/null +++ b/modules/ansible-roles/roles/cloudmonitoring/tasks/mgmt-cloudmonitoring-configure.yml @@ -0,0 +1,306 @@ +--- +- name: Check if monitoring is enabled + ansible.builtin.debug: + msg: "Cloud Monitoring is enabled for management" + when: monitoring_enable_for_management | bool + +- name: Install required packages + ansible.builtin.yum: + name: + - git + - python3-pip + - wget + state: present + when: monitoring_enable_for_management | bool + +- name: Install Python 3.11 + ansible.builtin.yum: + name: python3.11 + state: present + when: monitoring_enable_for_management | bool + +- name: Ensure Python 3.11 is set as default + ansible.builtin.file: + src: /usr/bin/python3.11 + dest: /usr/bin/python3 + state: link + when: monitoring_enable_for_management | bool + +- name: Ensure pip is installed for Python 3.11 + ansible.builtin.shell: | + /usr/bin/python3.11 -m ensurepip --default-pip + /usr/bin/python3.11 -m pip install --upgrade pip + args: 
+ executable: /bin/bash + when: monitoring_enable_for_management | bool + +- name: Detect correct pip executable #pragma: allowlist secret + ansible.builtin.command: "/usr/bin/python3.11 -m pip --version" + register: pip_check + ignore_errors: true + changed_when: false + +- name: Set correct pip executable path + ansible.builtin.set_fact: + pip_executable: "{{ '/usr/bin/python3.11 -m pip' if 'pip' in pip_check.stdout else '/usr/bin/pip3' }}" + +- name: Check if Sysdig Agent binary exists + ansible.builtin.stat: + path: /opt/draios/bin/dragent + register: sysdig_binary + when: monitoring_enable_for_management | bool + +- name: Install Sysdig Agent if binary is missing + ansible.builtin.shell: | + echo "Installing Sysdig Agent..." + curl -sL https://ibm.biz/install-sysdig-agent | sudo bash -s -- \ + --access_key {{ cloud_monitoring_access_key }} \ + --collector {{ cloud_monitoring_ingestion_url }} \ + --collector_port 6443 \ + --secure true \ + --check_certificate false \ + --additional_conf 'sysdig_capture_enabled: false\nremotefs: true\nfeature:\n mode: monitor_light' + when: + - monitoring_enable_for_management | bool + - not sysdig_binary.stat.exists + +- name: Configure Sysdig + ansible.builtin.lineinfile: + path: "{{ sysdig_config_file }}" + regexp: "{{ item.regexp }}" + line: "{{ item.line }}" + create: yes + loop: + - { + regexp: "==ACCESSKEY==", + line: "customerid: {{ cloud_monitoring_access_key }}", + } + - { + regexp: "==COLLECTOR==", + line: "collector: {{ cloud_monitoring_ingestion_url }}", + } + - { regexp: "^tags:", line: "tags: type:management,lsf:true" } + when: monitoring_enable_for_management | bool + +- name: Ensure Sysdig Agent is enabled and running + ansible.builtin.systemd: + name: dragent + enabled: yes + state: started + when: monitoring_enable_for_management | bool + +- name: Create Prometheus user + ansible.builtin.user: + name: prometheus + shell: /sbin/nologin + comment: "Prometheus user account" + when: monitoring_enable_for_management | bool + +- name: Download and extract Prometheus + ansible.builtin.shell: | + wget https://github.com/prometheus/prometheus/releases/download/v{{ PROMETHEUS_VERSION }}/prometheus-{{ PROMETHEUS_VERSION }}.linux-amd64.tar.gz + tar xzvf prometheus-{{ PROMETHEUS_VERSION }}.linux-amd64.tar.gz + mkdir -p /opt/prometheus/ + cp -av prometheus-{{ PROMETHEUS_VERSION }}.linux-amd64/* /opt/prometheus/ + chown -R prometheus:prometheus /opt/prometheus/ + when: monitoring_enable_for_management | bool + +- name: Check if LSF Prometheus Exporter is already set up + ansible.builtin.stat: + path: /etc/systemd/system/lsf_prometheus_exporter.service + register: exporter_installed + when: monitoring_enable_for_management | bool + +- name: Ensure clean installation of Python dependencies + ansible.builtin.shell: | + /usr/bin/python3.11 -m pip uninstall -y prometheus_client || true + /usr/bin/python3.11 -m pip install --no-cache-dir --force-reinstall prometheus_client lsf_prometheus_exporter + args: + executable: /bin/bash + when: monitoring_enable_for_management | bool + +- name: Install LSF Prometheus Exporter using pip + ansible.builtin.pip: + name: lsf_prometheus_exporter + executable: /usr/local/bin/pip3.11 + extra_args: --no-cache-dir --force-reinstall + when: + - monitoring_enable_for_management | bool + - not exporter_installed.stat.exists + +- name: Fix LSF Prometheus Exporter permissions if already installed + ansible.builtin.shell: | + echo "Exporter already present. 
Fixing permissions..."; + find /usr/local/lib/python3.11/site-packages/lsf_prometheus_exporter* -type d -exec chmod o+rx {} + + find /usr/local/lib/python3.11/site-packages/lsf_prometheus_exporter* -type f -exec chmod o+r {} + + args: + executable: /bin/bash + when: + - monitoring_enable_for_management | bool + - exporter_installed.stat.exists + +- name: Configure Prometheus + ansible.builtin.copy: + content: | + global: + scrape_interval: 60s + evaluation_interval: 15s + + scrape_configs: + - job_name: "lsf_prometheus_exporter" + static_configs: + - targets: ["localhost:9405"] + + remote_write: + - url: "{{ cloud_monitoring_prws_url }}" + authorization: + credentials: "{{ cloud_monitoring_prws_key }}" # pragma: allowlist secret + dest: "{{ prometheus_config_file }}" + when: monitoring_enable_for_management | bool + +- name: Create start script for LSF Prometheus Exporter + ansible.builtin.copy: + dest: /opt/ibm/lsfsuite/lsf/start_lsf_prometheus_exporter.sh + content: | + #!/bin/bash + exec >> /var/log/lsf_prometheus_exporter.log 2>&1 + source /opt/ibm/lsfsuite/lsf/conf/profile.lsf + exec /usr/bin/python3 -m lsf_prometheus_exporter + mode: "0755" + owner: lsfadmin + group: lsfadmin + when: + - monitoring_enable_for_management | bool + +- name: Create systemd service for Prometheus Agent + ansible.builtin.copy: + dest: /etc/systemd/system/prometheus.service + mode: "0644" + content: | + [Unit] + Description=Prometheus Agent + After=network-online.target + + [Service] + Type=simple + ExecStart=/opt/prometheus/prometheus \ + --config.file=/opt/prometheus/prometheus.yml \ + --enable-feature=agent \ + --storage.agent.path="/opt/prometheus/data-agent" + TimeoutSec=0 + RemainAfterExit=yes + GuessMainPID=no + Restart=on-failure + RestartSec=10 + User=prometheus + Group=prometheus + + [Install] + WantedBy=multi-user.target + when: + - monitoring_enable_for_management | bool + - not exporter_installed.stat.exists + +- name: Enable LSF scheduler metrics for Prometheus + ansible.builtin.lineinfile: + path: "{{ LSF_CONF }}/lsbatch/{{ prefix }}/configdir/lsb.params" + insertbefore: "^End Parameters" + line: "SCHED_METRIC_ENABLE=Y" + state: present + backup: yes + when: + - monitoring_enable_for_management | bool + +- name: Restart lsfd service to apply scheduler metric changes + ansible.builtin.systemd: + name: lsfd + state: restarted + enabled: yes + when: + - monitoring_enable_for_management | bool + +- name: Reload systemd and start Prometheus Agent + ansible.builtin.systemd: + daemon_reload: yes + name: prometheus + enabled: yes + state: restarted + when: monitoring_enable_for_management | bool + +- name: Create systemd service for Prometheus Agent + ansible.builtin.copy: + dest: /etc/systemd/system/prometheus.service + content: | + [Unit] + Description=Prometheus Agent + After=network-online.target + + [Service] + Type=simple + ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --enable-feature=agent --storage.agent.path="/opt/prometheus/data-agent" + TimeoutSec=0 + RemainAfterExit=yes + GuessMainPID=no + RestartSec=10 + User=prometheus + Group=prometheus + + [Install] + WantedBy=multi-user.target + mode: "0644" + when: monitoring_enable_for_management | bool + +- name: Reload systemd and start Prometheus Agent + ansible.builtin.systemd: + daemon_reload: yes + name: prometheus + enabled: yes + state: restarted + when: monitoring_enable_for_management | bool + +- name: Ensure start script has correct permissions + ansible.builtin.file: + path: 
/opt/ibm/lsfsuite/lsf/start_lsf_prometheus_exporter.sh + mode: "0755" + owner: lsfadmin + group: lsfadmin + when: monitoring_enable_for_management | bool + +- name: Create systemd service for LSF Prometheus Exporter + ansible.builtin.copy: + dest: /etc/systemd/system/lsf_prometheus_exporter.service + content: | + [Unit] + Description=IBM LSF Prometheus Exporter Service + After=network-online.target + + [Service] + Type=simple + ExecStart=/opt/ibm/lsfsuite/lsf/start_lsf_prometheus_exporter.sh + TimeoutSec=0 + RemainAfterExit=yes + GuessMainPID=no + RestartSec=10 + User=lsfadmin + Group=lsfadmin + Restart=always + + [Install] + WantedBy=multi-user.target + mode: "0644" + when: monitoring_enable_for_management | bool + +- name: Reload systemd and start LSF Prometheus Exporter + ansible.builtin.systemd: + daemon_reload: yes + name: lsf_prometheus_exporter + enabled: yes + state: restarted + when: monitoring_enable_for_management | bool + +- name: Enable and restart Sysdig Agent + ansible.builtin.systemd: + name: dragent + enabled: yes + state: restarted + when: monitoring_enable_for_management | bool diff --git a/modules/ansible-roles/roles/cloudmonitoring/vars/main.yml b/modules/ansible-roles/roles/cloudmonitoring/vars/main.yml new file mode 100644 index 00000000..d32d3330 --- /dev/null +++ b/modules/ansible-roles/roles/cloudmonitoring/vars/main.yml @@ -0,0 +1,4 @@ +sysdig_config_file: "/opt/draios/etc/dragent.yaml" +prometheus_config_file: "/opt/prometheus/prometheus.yml" +LSF_CONF: "/opt/ibm/lsfsuite/lsf/conf" +PROMETHEUS_VERSION: "2.51.1" diff --git a/modules/ansible-roles/roles/ldap_client_config/handlers/main.yml b/modules/ansible-roles/roles/ldap_client_config/handlers/main.yml new file mode 100644 index 00000000..1310d994 --- /dev/null +++ b/modules/ansible-roles/roles/ldap_client_config/handlers/main.yml @@ -0,0 +1,8 @@ +--- + +# Task: Restart and enable SSH service +- name: Restart SSH Service + ansible.builtin.service: + name: sshd + state: restarted + enabled: true diff --git a/modules/ansible-roles/roles/ldap_client_config/tasks/ldap_prerequisites.yml b/modules/ansible-roles/roles/ldap_client_config/tasks/ldap_prerequisites.yml new file mode 100644 index 00000000..56c33b9c --- /dev/null +++ b/modules/ansible-roles/roles/ldap_client_config/tasks/ldap_prerequisites.yml @@ -0,0 +1,22 @@ +--- + +# Install required LDAP client packages for integration + +- name: LDAP | Determine RHEL major version for package compatibility + ansible.builtin.shell: "grep -oE 'release [0-9]+' /etc/redhat-release | awk '{print $2}'" + register: rhel_version + changed_when: false + +- name: LDAP | Install required OpenLDAP and SSSD packages on RHEL 8/9 + ansible.builtin.dnf: + name: + - libnsl + - libnsl2 + - openldap-clients + - sssd + - sssd-ldap + - oddjob-mkhomedir + - openssl-perl + - authselect + state: present + when: rhel_version.stdout in ["8", "9"] diff --git a/modules/ansible-roles/roles/ldap_client_config/tasks/ldap_user_integration.yml b/modules/ansible-roles/roles/ldap_client_config/tasks/ldap_user_integration.yml new file mode 100644 index 00000000..06f3416c --- /dev/null +++ b/modules/ansible-roles/roles/ldap_client_config/tasks/ldap_user_integration.yml @@ -0,0 +1,101 @@ +--- + +# Detect OS version (RHEL 8/9 check) +- name: LDAP | Detect RHEL major version + ansible.builtin.shell: "grep -oE 'release [0-9]+' /etc/redhat-release | awk '{print $2}'" + register: rhel_version + changed_when: false + +# Proceed with LDAP client configuration only on supported RHEL versions +- block: + + # Copy 
LDAP certificate if not already present + - name: LDAP | Check if local ldap_cacert.pem exists + ansible.builtin.stat: + path: "{{ LDAP_CERT_FILES_DIR }}/ldap_cacert.pem" + register: ldap_cert_stat + + - name: LDAP | Copy ldap_cacert.pem to remote OpenLDAP certs directory + ansible.builtin.copy: + src: "{{ LDAP_CERT_FILES_DIR }}/ldap_cacert.pem" + dest: /etc/openldap/certs/ldap_cacert.pem + owner: root + group: root + mode: '0600' + when: not ldap_cert_stat.stat.exists + register: ldap_cert_result + + # Configure LDAP client + - name: LDAP | Update ldap.conf with server and certificate details + ansible.builtin.blockinfile: + path: /etc/openldap/ldap.conf + block: | + BASE dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} + URI ldap://{{ LDAP_SERVER_IP }}/ + TLS_CACERT /etc/openldap/certs/ldap_cacert.pem + TLS_CACERTDIR /etc/openldap/certs + create: yes + + - name: LDAP | Rehash OpenLDAP certs directory + ansible.builtin.command: + cmd: openssl rehash /etc/openldap/certs + + # Configure SSSD + - name: LDAP | Configure authselect to use SSSD with mkhomedir + ansible.builtin.command: + cmd: authselect select sssd with-mkhomedir --force + + - name: LDAP | Remove existing SSSD config (if present) + ansible.builtin.file: + path: /etc/sssd/sssd.conf + state: absent + + - name: LDAP | Generate new SSSD configuration file from template + ansible.builtin.template: + src: sssd.conf.j2 + dest: /etc/sssd/sssd.conf + mode: '0600' + owner: root + group: root + + - name: LDAP | Restart and enable SSSD and oddjobd services + ansible.builtin.systemd: + name: "{{ item }}" + state: restarted + enabled: yes + loop: + - sssd + - oddjobd + + # SSH Configuration + - name: LDAP | Enable SSH password authentication + ansible.builtin.command: + cmd: sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config + notify: Restart SSH Service + + # Upload certificate to shared directory + - name: LDAP | Ensure shared OpenLDAP certificate directory exists + ansible.builtin.file: + path: "{{ ha_shared_dir }}/openldap" + state: directory + mode: '0755' + run_once: true + + - name: LDAP | Upload ldap_cacert.pem to shared directory + ansible.builtin.copy: + src: "{{ LDAP_CERT_FILES_DIR }}/ldap_cacert.pem" + dest: "{{ ha_shared_dir }}/openldap/ldap_cacert.pem" + mode: '0644' + run_once: true + + when: rhel_version.stdout in ["8", "9"] + + rescue: + - name: LDAP | Log error if LDAP client configuration fails + ansible.builtin.debug: + msg: "LDAP client configuration block failed. Check previous task results." + + always: + - name: LDAP | Always ensure permissions are reset or log final status + ansible.builtin.debug: + msg: "LDAP configuration block completed (success or failure)." 
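For reference, a minimal sketch of what the "LDAP | Update ldap.conf with server and certificate details" task above renders into /etc/openldap/ldap.conf, assuming a hypothetical two-label ldap_basedns of example.com and an ldap_server of 10.241.0.5 (placeholders, not values from this change; blockinfile's default ANSIBLE MANAGED BLOCK markers are omitted). Note that the BASE_DN.split('.') expressions only cover a base domain with exactly two labels:

BASE dc=example,dc=com
URI ldap://10.241.0.5/
TLS_CACERT /etc/openldap/certs/ldap_cacert.pem
TLS_CACERTDIR /etc/openldap/certs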
diff --git a/modules/ansible-roles/roles/ldap_client_config/tasks/main.yml b/modules/ansible-roles/roles/ldap_client_config/tasks/main.yml new file mode 100644 index 00000000..6b1660ba --- /dev/null +++ b/modules/ansible-roles/roles/ldap_client_config/tasks/main.yml @@ -0,0 +1,13 @@ +--- + +# Upload or update the LDAP server certificate if needed +- name: LDAP Client Config | Upload LDAP server certificate + import_tasks: upload_ldap_cert.yml + +# Install prerequisite packages required for LDAP integration +- name: LDAP Client Config | Install LDAP prerequisite packages + import_tasks: ldap_prerequisites.yml + +# Configure the system as an LDAP client +- name: LDAP Client Config | Configure LDAP client integration + import_tasks: ldap_user_integration.yml diff --git a/modules/ansible-roles/roles/ldap_client_config/tasks/upload_ldap_cert.yml b/modules/ansible-roles/roles/ldap_client_config/tasks/upload_ldap_cert.yml new file mode 100644 index 00000000..f2f5b9c1 --- /dev/null +++ b/modules/ansible-roles/roles/ldap_client_config/tasks/upload_ldap_cert.yml @@ -0,0 +1,29 @@ +--- + +# Copy the existing LDAP server certificate to the Ansible controller + +- name: LDAP | Ensure the local LDAP certificate directory exists + file: + path: "{{ LDAP_CERT_FILES_DIR }}" + state: directory + mode: '0755' + delegate_to: localhost + run_once: true + when: LDAP_SERVER_CERT is defined and (LDAP_SERVER_CERT | length > 10) + +- name: LDAP | Check if ldap_cacert.pem already exists locally + stat: + path: "{{ LDAP_CERT_FILES_DIR }}/ldap_cacert.pem" + register: ldap_cert_status + delegate_to: localhost + run_once: true + when: LDAP_SERVER_CERT is defined and (LDAP_SERVER_CERT | length > 10) + +- name: LDAP | Copy ldap_cacert.pem to the local filesystem if not present + copy: + content: "{{ LDAP_SERVER_CERT | replace('\\n', '\n') }}" + dest: "{{ LDAP_CERT_FILES_DIR }}/ldap_cacert.pem" + mode: '0644' + delegate_to: localhost + run_once: true + when: LDAP_SERVER_CERT is defined and (LDAP_SERVER_CERT | length > 10) and not ldap_cert_status.stat.exists diff --git a/modules/ansible-roles/roles/ldap_client_config/templates/sssd.conf.j2 b/modules/ansible-roles/roles/ldap_client_config/templates/sssd.conf.j2 new file mode 100644 index 00000000..2f7abb4d --- /dev/null +++ b/modules/ansible-roles/roles/ldap_client_config/templates/sssd.conf.j2 @@ -0,0 +1,21 @@ +[sssd] +config_file_version = 2 +services = nss, pam, autofs +domains = default + +[nss] +homedir_substring = /home + +[pam] + +[domain/default] +id_provider = ldap +autofs_provider = ldap +auth_provider = ldap +chpass_provider = ldap +ldap_uri = ldap://{{ LDAP_SERVER_IP }} +ldap_search_base = dc={{ BASE_DN | regex_replace('\\..*', '') }},dc={{ BASE_DN | regex_replace('^[^.]+\\.', '') }} +ldap_id_use_start_tls = True +ldap_tls_cacertdir = /etc/openldap/certs +cache_credentials = True +ldap_tls_reqcert = allow diff --git a/modules/ansible-roles/roles/ldap_client_config/vars/main.yml b/modules/ansible-roles/roles/ldap_client_config/vars/main.yml new file mode 100644 index 00000000..c555ee71 --- /dev/null +++ b/modules/ansible-roles/roles/ldap_client_config/vars/main.yml @@ -0,0 +1,7 @@ +# LDAP Configuration Variables + +LDAP_ADMIN_PASSWORD: "{{ ldap_admin_password }}" #pragma: allowlist secret +BASE_DN: "{{ ldap_basedns }}" +LDAP_SERVER_IP: "{{ ldap_server }}" +LDAP_SERVER_CERT: "{{ ldap_server_cert }}" +LDAP_CERT_FILES_DIR: "/opt/ibm/terraform-ibm-hpc/modules/ansible-roles/ldap_key" diff --git a/modules/ansible-roles/roles/ldap_server_prepare/tasks/cleanup_secrets.yml 
b/modules/ansible-roles/roles/ldap_server_prepare/tasks/cleanup_secrets.yml new file mode 100644 index 00000000..ea07de15 --- /dev/null +++ b/modules/ansible-roles/roles/ldap_server_prepare/tasks/cleanup_secrets.yml @@ -0,0 +1,14 @@ +--- +# Cleaning the secret files which are stored locally. + +- name: LDAP_Server_Setup | Cleaning the LDIF Files + file: + path: "{{ LDAP_DIR }}" + state: absent + recurse: false + register: cleanup_output + run_once: true + +- debug: + var: cleanup_output.stdout_lines + when: cleanup_output is defined diff --git a/modules/ansible-roles/roles/ldap_server_prepare/tasks/get_ldap_certs.yml b/modules/ansible-roles/roles/ldap_server_prepare/tasks/get_ldap_certs.yml new file mode 100644 index 00000000..ec24902f --- /dev/null +++ b/modules/ansible-roles/roles/ldap_server_prepare/tasks/get_ldap_certs.yml @@ -0,0 +1,19 @@ +--- +# Getting OpenLDAP SSL Certificate + +- name: LDAP_Server_Setup | Check if CA certificate exists on the remote server + stat: + path: /usr/local/share/ca-certificates/ldap_cacert.pem + register: remote_cert_status + +- name: LDAP_Server_Setup | Check if CA certificate already exists locally + stat: + path: "{{ LDAP_CERT_FILES_DIR }}/ldap_cacert.pem" + register: ldap_cert_status + +- name: LDAP_Server_Setup | Download CA certificate from remote to local if not present locally + fetch: + src: /usr/local/share/ca-certificates/ldap_cacert.pem + dest: "{{ LDAP_CERT_FILES_DIR }}/ldap_cacert.pem" + flat: true + when: remote_cert_status.stat.exists and not ldap_cert_status.stat.exists diff --git a/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_base_ou.yml b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_base_ou.yml new file mode 100644 index 00000000..3099214c --- /dev/null +++ b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_base_ou.yml @@ -0,0 +1,54 @@ +--- +# Creating Base OU for the LDAP Server + +- name: LDAP_Server_Setup | Server People OU File + shell: | + echo "dn: ou=People,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} + objectClass: organizationalUnit + ou: People" > "{{ LDAP_DIR }}/oupeople.ldif" + register: people_ou_file + +- name: LDAP_Server_Setup | Server Groups OU File + shell: | + echo "dn: ou=Groups,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} + objectClass: organizationalUnit + ou: Groups" > "{{ LDAP_DIR }}/ougroups.ldif" + register: groups_ou_file + +- name: LDAP_Server_Setup | Groups OU Check + ansible.builtin.shell: | + ldapsearch -x -D cn=admin,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} -w {{ LDAP_ADMIN_PASSWORD }} -b "ou=Groups,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }}" "objectClass=organizationalUnit" > /dev/null 2>&1 + if [ $? -eq 32 ]; then + echo "GroupsOUNotFound" + else + echo "GroupsOUFound" + fi + register: ldap_groups_ou_search + +- name: LDAP_Server_Setup | Groups OU Create + ansible.builtin.command: ldapadd -x -D cn=admin,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} -w {{ LDAP_ADMIN_PASSWORD }} -f {{ LDAP_DIR }}/ougroups.ldif + when: ldap_groups_ou_search.stdout == "GroupsOUNotFound" + +- name: LDAP_Server_Setup | Groups OU Exist + debug: + msg: "LDAP OU 'Groups' already exists. Skipping." 
+ when: ldap_groups_ou_search.stdout == "GroupsOUFound" + +- name: LDAP_Server_Setup | People OU Check + ansible.builtin.shell: | + ldapsearch -x -D cn=admin,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} -w {{ LDAP_ADMIN_PASSWORD }} -b "ou=People,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }}" "objectClass=organizationalUnit" > /dev/null 2>&1 + if [ $? -eq 32 ]; then + echo "PeopleOUNotFound" + else + echo "PeopleOUFound" + fi + register: ldap_people_ou_search + +- name: LDAP_Server_Setup | People OU Create + ansible.builtin.command: ldapadd -x -D cn=admin,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} -w {{ LDAP_ADMIN_PASSWORD }} -f {{ LDAP_DIR }}/oupeople.ldif + when: ldap_people_ou_search.stdout == "PeopleOUNotFound" + +- name: LDAP_Server_Setup | People OU Exist + debug: + msg: "LDAP OU 'People' already exists. Skipping." + when: ldap_people_ou_search.stdout == "PeopleOUFound" diff --git a/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_env.yml b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_env.yml new file mode 100644 index 00000000..9e75df23 --- /dev/null +++ b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_env.yml @@ -0,0 +1,205 @@ +--- + +# LDAP Server Setup +- name: LDAP_Server_Setup | LDAP Directory | Creation + file: + path: "{{ LDAP_DIR }}" + state: directory + register: create_dir_output + run_once: true + +- name: LDAP_Server_Setup | Update apt package index + apt: + update_cache: yes + +- name: LDAP_Server_Setup | Set basedomain and rootdomain + set_fact: + basedomain: "{{ BASE_DN.split('.')[0] }}" + rootdomain: "{{ BASE_DN.split('.')[1] }}" + +- name: LDAP_Server_Setup | Debug basedomain and rootdomain values + debug: + msg: + - "Basedomain: {{ basedomain }}" + - "Rootdomain: {{ rootdomain }}" + +- name: LDAP_Server_Setup | Install required packages + apt: + name: + - nfs-common + - gnutls-bin + - ssl-cert + - debconf-utils + state: present + force_apt_get: yes + +- name: LDAP_Server_Setup | Install the openldap and required packages for ubuntu + ansible.builtin.apt: + name: "{{ OPENLDAP_SERVER_PKGS }}" + state: present + update_cache: true + when: ansible_os_family == 'Debian' + +- name: LDAP_Server_Setup | Reconfigure slapd + shell: | + echo "slapd slapd/root_password password {{ LDAP_ADMIN_PASSWORD }}" | debconf-set-selections + echo "slapd slapd/root_password_again password {{ LDAP_ADMIN_PASSWORD }}" | debconf-set-selections + echo "slapd slapd/internal/adminpw password {{ LDAP_ADMIN_PASSWORD }}" | debconf-set-selections + echo "slapd slapd/internal/generated_adminpw password {{ LDAP_ADMIN_PASSWORD }}" | debconf-set-selections + echo "slapd slapd/password1 password {{ LDAP_ADMIN_PASSWORD }}" | debconf-set-selections + echo "slapd slapd/password2 password {{ LDAP_ADMIN_PASSWORD }}" | debconf-set-selections + echo "slapd slapd/domain string {{ BASE_DN }}" | debconf-set-selections + echo "slapd shared/organization string {{ LDAP_GROUP }}" | debconf-set-selections + echo "slapd slapd/purge_database boolean false" | debconf-set-selections + echo "slapd slapd/move_old_database boolean true" | debconf-set-selections + echo "slapd slapd/no_configuration boolean false" | debconf-set-selections + dpkg-reconfigure -f noninteractive slapd + +- name: LDAP_Server_Setup | Set BASE in ldap.conf + lineinfile: + path: /etc/ldap/ldap.conf + line: "BASE dc={{ basedomain }},dc={{ rootdomain }}" + create: yes + +- name: LDAP_Server_Setup | Set URI in ldap.conf + lineinfile: + path: /etc/ldap/ldap.conf + line: 
"URI ldap://localhost" + create: yes + +- name: LDAP_Server_Setup | Restart slapd service + service: + name: slapd + state: restarted + +- name: LDAP_Server_Setup | Check slapd service status + command: systemctl status slapd + register: slapd_status + +- name: LDAP_Server_Setup | Display slapd status + debug: + var: slapd_status + +- name: LDAP_Server_Setup | Generate private key for CA + command: + cmd: > + certtool --generate-privkey --sec-param High --outfile /etc/ssl/private/ldap_cakey.pem + args: + creates: /etc/ssl/private/ldap_cakey.pem + +- name: LDAP_Server_Setup | Create CA template file + copy: + dest: /etc/ssl/ca.info + content: | + cn = {{ LDAP_GROUP }} + ca + cert_signing_key + expiration_days = 3650 + +- name: LDAP_Server_Setup | Generate self-signed CA certificate + command: + cmd: > + certtool --generate-self-signed + --load-privkey /etc/ssl/private/ldap_cakey.pem + --template /etc/ssl/ca.info + --outfile /usr/local/share/ca-certificates/ldap_cacert.pem + args: + creates: /usr/local/share/ca-certificates/ldap_cacert.pem + +- name: LDAP_Server_Setup | Update CA certificates + command: update-ca-certificates + +- name: LDAP_Server_Setup | Copy CA certificate to /etc/ssl/certs + copy: + src: /usr/local/share/ca-certificates/ldap_cacert.pem + dest: /etc/ssl/certs/ldap_cacert.pem + remote_src: yes + +- name: LDAP_Server_Setup | Generate private key for LDAP server + command: + cmd: > + certtool --generate-privkey --sec-param High --outfile /etc/ssl/private/ldapserver_slapd_key.pem + args: + creates: /etc/ssl/private/ldapserver_slapd_key.pem + +- name: LDAP_Server_Setup | Create LDAP server certificate template + copy: + dest: /etc/ssl/ldapserver.info + content: | + organization = {{ LDAP_GROUP }} + cn = localhost + tls_www_server + encryption_key + signing_key + expiration_days = 3650 + +- name: LDAP_Server_Setup | Generate certificate for LDAP server signed by CA + command: + cmd: > + certtool --generate-certificate + --load-privkey /etc/ssl/private/ldapserver_slapd_key.pem + --load-ca-certificate /etc/ssl/certs/ldap_cacert.pem + --load-ca-privkey /etc/ssl/private/ldap_cakey.pem + --template /etc/ssl/ldapserver.info + --outfile /etc/ssl/certs/ldapserver_slapd_cert.pem + args: + creates: /etc/ssl/certs/ldapserver_slapd_cert.pem + +- name: LDAP_Server_Setup | Set proper permissions for LDAP server private key + file: + path: /etc/ssl/private/ldapserver_slapd_key.pem + group: openldap + mode: "0640" + state: file + +- name: LDAP_Server_Setup | Add openldap to ssl-cert group + command: gpasswd -a openldap ssl-cert + +- name: LDAP_Server_Setup | Pause for 2 seconds + command: sleep 2 + +- name: LDAP_Server_Setup | Restart slapd service + service: + name: slapd + state: restarted + +- name: LDAP_Server_Setup | Create LDIF file for TLS configuration + copy: + dest: /etc/ssl/certinfo.ldif + content: | + dn: cn=config + add: olcTLSCACertificateFile + olcTLSCACertificateFile: /etc/ssl/certs/ldap_cacert.pem + - + add: olcTLSCertificateFile + olcTLSCertificateFile: /etc/ssl/certs/ldapserver_slapd_cert.pem + - + add: olcTLSCertificateKeyFile + olcTLSCertificateKeyFile: /etc/ssl/private/ldapserver_slapd_key.pem + +- name: LDAP_Server_Setup | Apply TLS configuration using ldapmodify + command: + cmd: ldapmodify -Y EXTERNAL -H ldapi:/// -f /etc/ssl/certinfo.ldif + +- name: LDAP_Server_Setup | Pause for 2 seconds + command: sleep 2 + +- name: LDAP_Server_Setup | Update slapd to listen on ldaps:// + replace: + path: /etc/default/slapd + regexp: 'SLAPD_SERVICES="ldap:/// ldapi:///"' + replace: 
'SLAPD_SERVICES="ldap:/// ldapi:/// ldaps:///"' + +- name: LDAP_Server_Setup | Update ldap.conf with TLS configuration + copy: + dest: /etc/ldap/ldap.conf + content: | + BASE dc={{ basedomain }},dc={{ rootdomain }} + URI ldap://localhost + TLS_CACERT /etc/ssl/certs/ldap_cacert.pem + TLS_REQCERT allow + +- name: LDAP_Server_Setup | Restart slapd service after configuration + service: + name: slapd + state: restarted diff --git a/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_groups.yml b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_groups.yml new file mode 100644 index 00000000..b98c71a5 --- /dev/null +++ b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_groups.yml @@ -0,0 +1,28 @@ +--- +# Creating LDAP Group on the LDAP Server + +- name: LDAP_Server_Setup | Group File + shell: | + echo "dn: cn={{ LDAP_GROUP }},ou=Groups,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} + objectClass: posixGroup + cn: {{ LDAP_GROUP }} + gidNumber: 5000" > "{{ LDAP_DIR }}/group.ldif" + +- name: LDAP_Server_Setup | Check Group Existence + ansible.builtin.shell: | + ldap_group_search_result=$(ldapsearch -x -D cn=admin,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} -w {{ LDAP_ADMIN_PASSWORD }} -b "ou=groups,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }}" "(cn={{ LDAP_GROUP }})" 2>&1) + if echo "$ldap_group_search_result" | grep -q "dn: cn={{ LDAP_GROUP }},"; then + echo "GroupFound" + else + echo "GroupNotFound" + fi + register: ldap_group_search + +- name: LDAP_Server_Setup | Group Created + ansible.builtin.command: ldapadd -x -D cn=admin,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} -w {{ LDAP_ADMIN_PASSWORD }} -f {{ LDAP_DIR }}/group.ldif + when: ldap_group_search.stdout == "GroupNotFound" + +- name: LDAP_Server_Setup | Group Exist + debug: + msg: "LDAP Group '{{ LDAP_GROUP }}' already exists. Skipping." + when: ldap_group_search.stdout == "GroupFound" diff --git a/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_installation_status.yml b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_installation_status.yml new file mode 100644 index 00000000..2ed8c8e0 --- /dev/null +++ b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_installation_status.yml @@ -0,0 +1,10 @@ +--- +- name: LDAP_Server_Setup | Check if LDAP configuration is already applied + stat: + path: /etc/ldap/.ldap_configured + register: ldap_config_status + +- name: LDAP_Server_Setup | Skip configuration if already applied + debug: + msg: "LDAP configuration already applied, skipping." 
+ when: ldap_config_status.stat.exists diff --git a/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_users.yml b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_users.yml new file mode 100644 index 00000000..2bc41590 --- /dev/null +++ b/modules/ansible-roles/roles/ldap_server_prepare/tasks/ldap_users.yml @@ -0,0 +1,48 @@ +--- +# Creating LDAP User on the LDAP Server + +- name: LDAP_Server_Setup | Generate LDAP Password Hash + ansible.builtin.command: slappasswd -s "{{ LDAP_USER_PASSWORD }}" + register: ldap_hashed_password + +- name: LDAP_Server_Setup | User File + shell: | + echo "dn: uid={{ LDAP_USER }},ou=People,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} + objectClass: inetOrgPerson + objectClass: posixAccount + objectClass: shadowAccount + uid: {{ LDAP_USER }} + sn: {{ LDAP_USER }} + givenName: {{ LDAP_USER }} + cn: {{ LDAP_USER }} + displayName: {{ LDAP_USER }} + uidNumber: 10000 + gidNumber: 5000 + userPassword: {{ ldap_hashed_password.stdout }} + gecos: {{ LDAP_USER }} + loginShell: /bin/bash + homeDirectory: /home/{{ LDAP_USER }}" > "{{ LDAP_DIR }}/users.ldif" + +- name: LDAP_Server_Setup | User Check + ansible.builtin.shell: | + ldap_user_search_result=$(ldapsearch -x -D cn=admin,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} -w {{ LDAP_ADMIN_PASSWORD }} -b "ou=people,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }}" uid cn 2>&1) + if echo "$ldap_user_search_result" | grep -q "dn: uid={{ LDAP_USER }},"; then + echo "UserFound" + else + echo "UserNotFound" + fi + register: ldap_user_search + +- name: LDAP_Server_Setup | User Created + ansible.builtin.command: ldapadd -x -D cn=admin,dc={{ BASE_DN.split('.')[0] }},dc={{ BASE_DN.split('.')[1] }} -w {{ LDAP_ADMIN_PASSWORD }} -f {{ LDAP_DIR }}/users.ldif + when: ldap_user_search.stdout == "UserNotFound" + +- name: LDAP_Server_Setup | User Exist + debug: + msg: "LDAP User '{{ LDAP_USER }}' already exists. Skipping." + when: ldap_user_search.stdout == "UserFound" + +- name: LDAP_Server_Setup | Mark LDAP configuration as completed + file: + path: /etc/ldap/.ldap_configured + state: touch diff --git a/modules/ansible-roles/roles/ldap_server_prepare/tasks/main.yml b/modules/ansible-roles/roles/ldap_server_prepare/tasks/main.yml new file mode 100644 index 00000000..65fd54aa --- /dev/null +++ b/modules/ansible-roles/roles/ldap_server_prepare/tasks/main.yml @@ -0,0 +1,28 @@ +--- + +# Integration LDAP with LSF. +# Below are the LDAP Server configuration tasks to add OU, Groups and Users. + +# Check if LDAP configuration is already applied +- import_tasks: ldap_installation_status.yml + +# Conditionally execute tasks if LDAP is not configured +- block: + # Import the 'ldap_env.yml' task for setting the env to store the LDAP configuration files. + - import_tasks: ldap_env.yml + + # Import the 'get_ldap_certs.yml' task for getting the SSL certificate. + - import_tasks: get_ldap_certs.yml + + # Import the 'ldap_base_ou.yml' task for adding OU to LDAP. + - import_tasks: ldap_base_ou.yml + + # Import the 'ldap_groups.yml' task for adding groups to LDAP. + - import_tasks: ldap_groups.yml + + # Import the 'ldap_users.yml' task for adding users to LDAP. + - import_tasks: ldap_users.yml + + # Import the 'cleanup_secrets.yml' task for cleaning the confidential files stored locally. 
+ - import_tasks: cleanup_secrets.yml + when: not ldap_config_status.stat.exists diff --git a/modules/ansible-roles/roles/ldap_server_prepare/vars/main.yml b/modules/ansible-roles/roles/ldap_server_prepare/vars/main.yml new file mode 100644 index 00000000..18f35e84 --- /dev/null +++ b/modules/ansible-roles/roles/ldap_server_prepare/vars/main.yml @@ -0,0 +1,13 @@ +# LDAP Server Variables + +LDAP_DIR: "/opt/LDAP" +LDAP_ADMIN_PASSWORD: "{{ ldap_admin_password }}" #pragma: allowlist secret +LDAP_GROUP: "{{ prefix }}" +LDAP_USER: "{{ ldap_user_name }}" +LDAP_USER_PASSWORD: "{{ ldap_user_password }}" #pragma: allowlist secret +BASE_DN: "{{ ldap_basedns }}" +LDAP_SERVER_IP: "{{ ldap_server }}" +LDAP_CERT_FILES_DIR: "/opt/ibm/terraform-ibm-hpc/modules/ansible-roles/ldap_key" +OPENLDAP_SERVER_PKGS: + - slapd + - ldap-utils diff --git a/modules/ansible-roles/roles/lsf_login_config/tasks/login_node_configuration.yml b/modules/ansible-roles/roles/lsf_login_config/tasks/login_node_configuration.yml new file mode 100644 index 00000000..6fa39b96 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_login_config/tasks/login_node_configuration.yml @@ -0,0 +1,80 @@ +--- + +- name: Check if LSF logs directory exists + stat: + path: "{{ LSF_LOGS }}" + register: logs_dir_stat + +- name: Ensure LSF logs directory exists (recurse only on first creation) + file: + path: "{{ LSF_LOGS }}" + state: directory + owner: lsfadmin + group: lsfadmin + mode: '0755' + recurse: "{{ not logs_dir_stat.stat.exists }}" + +- name: Ensure LSF conf and work are symlinks via shell + shell: | + [ -L "{{ LSF_TOP }}/{{ item }}" ] && echo "Symlink exists, skipping." || { \ + [ -d "{{ LSF_TOP }}/{{ item }}" ] && rm -rf "{{ LSF_TOP }}/{{ item }}"; \ + ln -s /mnt/lsf/lsf/{{ item }} "{{ LSF_TOP }}/{{ item }}"; } + loop: + - conf + - work + +- name: Ensure correct ownership and permissions of /opt/ibm/lsfsuite + file: + path: "{{ LSF_SUITE }}" + owner: lsfadmin + group: lsfadmin + mode: '0777' + recurse: yes + +- name: Set login_node_host to first host in login_node group + set_fact: + login_node_host: "{{ groups['login_node'][0] }}" + +- name: Get IPv4 address of the current host + shell: "getent ahostsv4 {{ inventory_hostname }} | awk '{ print $1; exit }'" + register: ip_result + changed_when: false + +- name: Ensure login node entry exists in LSF hosts file + lineinfile: + path: /mnt/lsf/lsf/conf/hosts + line: "{{ ip_result.stdout }} {{ login_node_host }}.{{ dns_domain_names }}" + state: present + insertafter: EOF + create: yes + +- name: Insert hostname line after "#prune" only once + lineinfile: + path: "{{ LSF_CLUSTER_FILE }}" + insertafter: "^#prune" + line: "{{ login_node_host }}.{{ dns_domain_names }} Intel_E5 X86_64 0 ()" + state: present + +- name: Ensure LSF profile is sourced in root's .bashrc + lineinfile: + path: "/root/.bashrc" + line: "source {{ LSF_CONF }}/profile.lsf" + state: present + +- name: Ensure LSF profile is sourced in lsfadmin's .bashrc + lineinfile: + path: "{{ LSFADMIN_DIR }}/.bashrc" + line: "source {{ LSF_CONF }}/profile.lsf" + state: present + +- name: Source current user's .bashrc (only if updated) + shell: | + grep -q "source {{ LSF_CONF }}/profile.lsf" /root/.bashrc && source /root/.bashrc || true + args: + executable: /bin/bash + +- name: Source lsfadmin's .bashrc (only if updated) + shell: | + grep -q "source {{ LSF_CONF }}/profile.lsf" "{{ LSFADMIN_DIR }}/.bashrc" && source "{{ LSFADMIN_DIR }}/.bashrc" || true + args: + executable: /bin/bash diff --git 
a/modules/ansible-roles/roles/lsf_login_config/tasks/main.yml b/modules/ansible-roles/roles/lsf_login_config/tasks/main.yml new file mode 100644 index 00000000..167fd89a --- /dev/null +++ b/modules/ansible-roles/roles/lsf_login_config/tasks/main.yml @@ -0,0 +1,4 @@ +--- + +# Configure Login node +- import_tasks: login_node_configuration.yml diff --git a/modules/ansible-roles/roles/lsf_login_config/vars/main.yml b/modules/ansible-roles/roles/lsf_login_config/vars/main.yml new file mode 100644 index 00000000..9328fea8 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_login_config/vars/main.yml @@ -0,0 +1,8 @@ +LSF_SUITE: "/opt/ibm/lsfsuite" +LSF_TOP: "{{ LSF_SUITE }}/lsf" +LSF_CONF: "{{ LSF_TOP }}/conf" +LSF_WORK: "{{ LSF_TOP }}/work" +LSF_LOGS: "/opt/ibm/lsflogs" +LSF_HOSTS_FILE: "{{ LSF_CONF }}/hosts" +LSF_CLUSTER_FILE: "{{ LSF_CONF }}/lsf.cluster.{{ prefix }}" +LSFADMIN_DIR: "/home/lsfadmin" diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/tasks/app_center_configure.yml b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/app_center_configure.yml new file mode 100644 index 00000000..5f72237a --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/app_center_configure.yml @@ -0,0 +1,73 @@ +--- +# AppCenter HTTPS Configuration + +- name: PAC | Check if HTTPS is already enabled + ansible.builtin.command: pmcadmin https enable + register: https_check + changed_when: "'was already enabled' not in https_check.stdout" + failed_when: false + run_once: true + +- name: PAC | Debug HTTPS status + ansible.builtin.debug: + msg: "HTTPS is already enabled" + when: "'was already enabled' in https_check.stdout" + run_once: true + +- name: PAC | Configure HTTPS for AppCenter + block: + + - name: PAC | Set GUI password for lsfadmin + ansible.builtin.command: passwd --stdin lsfadmin + args: + stdin: "{{ app_center_gui_password }}" + + - name: PAC | Enable HTTPS access for AppCenter + ansible.builtin.command: > + pmcadmin https enable + --password {{ app_center_gui_password }} + --validhosts localhost + + - name: PAC | Stop pmcadmin service + ansible.builtin.command: pmcadmin stop + + - name: PAC | Pause before restarting pmcadmin + ansible.builtin.pause: + seconds: 5 + + - name: PAC | Start pmcadmin service + ansible.builtin.command: pmcadmin start + + - name: PAC | Update JS_PAC_SERVER_URL in js.conf + ansible.builtin.lineinfile: + path: "{{ JS_PAC_SERVER_URL }}" + regexp: '^JS_PAC_SERVER_URL=' + line: "JS_PAC_SERVER_URL=https://{{ lsf_masters[0] }}:8443" + backrefs: true + + - name: PAC | Stop ACD (Application Center Daemon) service + ansible.builtin.service: + name: acd + state: stopped + + - name: PAC | Pause before restarting ACD + ansible.builtin.pause: + seconds: 5 + + - name: PAC | Start ACD (Application Center Daemon) service + ansible.builtin.service: + name: acd + state: started + + rescue: + - name: PAC | Log error if AppCenter HTTPS configuration fails + ansible.builtin.debug: + msg: "AppCenter HTTPS configuration block failed. Check previous task results." + + always: + - name: PAC | Always log final status of AppCenter HTTPS configuration + ansible.builtin.debug: + msg: "AppCenter HTTPS configuration block completed (success or failure)." 
+ + when: "'was already enabled' not in https_check.stdout" + run_once: true diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/tasks/configure_dynamic_nodes_templates.yml b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/configure_dynamic_nodes_templates.yml new file mode 100644 index 00000000..8fa72065 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/configure_dynamic_nodes_templates.yml @@ -0,0 +1,35 @@ +--- +- name: Management Config Templates| Copy credentials + template: + src: "templates/credentials.j2" + dest: "{{ LSF_RC_IC_CONF }}/credentials" + mode: '0644' + run_once: true + +- name: Management Config Templates | Copy ibmcloudgen2_config.json + template: + src: "templates/ibmcloudgen2_config.json.j2" + dest: "{{ LSF_RC_IC_CONF }}/ibmcloudgen2_config.json" + mode: '0644' + run_once: true + +- name: Management Config Templates | Copy ibmcloudgen2_templates.json + template: + src: "templates/ibmcloudgen2_templates.json.j2" + dest: "{{ LSF_RC_IC_CONF }}/ibmcloudgen2_templates.json" + mode: '0644' + run_once: true + +- name: Management Config Templates | Copy hostProviders.json + template: + src: "templates/hostProviders.json.j2" + dest: "{{ LSF_CONF_FILE_PATH }}/resource_connector/hostProviders.json" + mode: '0644' + run_once: true + +- name: Management Config Templates | Copy user_data.sh + template: + src: "templates/user_data.sh" + dest: "{{ LSF_RC_IC_CONF }}/user_data.sh" + mode: '0644' + run_once: true diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/tasks/configure_management_nodes.yml b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/configure_management_nodes.yml new file mode 100644 index 00000000..670fc5f3 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/configure_management_nodes.yml @@ -0,0 +1,123 @@ +--- +- name: Management Config | Append LSF configuration settings + lineinfile: + path: "{{ LSF_CONF_FILE }}" + line: "{{ item }}" + create: yes + loop: + - "LSB_RC_EXTERNAL_HOST_IDLE_TIME=10" + - "LSF_DYNAMIC_HOST_WAIT_TIME=60" + - "LSF_DYNAMIC_HOST_TIMEOUT=\"EXPIRY[10m] THRESHOLD[250] INTERVAL[60m]\"" + - "LSB_RC_EXTERNAL_HOST_FLAG=\"icgen2host\"" + - "LSB_RC_UPDATE_INTERVAL=15" + - "LSB_RC_MAX_NEWDEMAND=50" + - "LSF_UDP_TO_TCP_THRESHOLD=9000" + - "LSF_CALL_LIM_WITH_TCP=Y" + - "LSF_ANNOUNCE_MASTER_TCP_WAITTIME=600" + - "LSF_RSH=\"ssh -o 'PasswordAuthentication no' -o 'StrictHostKeyChecking no'\"" + run_once: true + +- name: Management Config | Check if queue configuration already exists + shell: "grep -q '# ANSIBLE MANAGED: QUEUE_NAME added' '{{ LSF_LSBATCH_CONF }}/lsb.queues'" + register: queue_check + changed_when: false + failed_when: false + run_once: true + +- name: Management Config | Append LSF queue configuration to lsb.queues + blockinfile: + path: "{{ LSF_LSBATCH_CONF }}/lsb.queues" + insertafter: EOF + block: | + # ANSIBLE MANAGED: QUEUE_NAME added + Begin Queue + QUEUE_NAME=das_q + DATA_TRANSFER=Y + RC_HOSTS=all + HOSTS=all + RES_REQ=type==any + End Queue + marker: "" + when: queue_check.rc != 0 + run_once: true + +- name: Management Config | Uncomment "icgen2host" in lsf.shared + replace: + path: "{{ LSF_CONF_FILE_PATH }}/lsf.shared" + regexp: '^#\s*(icgen2host)' + replace: ' \1' + run_once: true + +- name: Management Config | Uncomment "schmod_demand" in lsb.modules + replace: + path: "{{ LSF_LSBATCH_CONF }}/lsb.modules" + regexp: '^#\s*(schmod_demand)' + replace: '\1' + run_once: true + +- name: Management Config | Check if RC_HOSTS modification was already done + stat: + path: 
"/tmp/rc_hosts_added" + register: rc_hosts_marker + run_once: true + +- name: Management Config | Add "RC_HOSTS = all" after QUEUE_NAME in lsb.queues using sed + shell: | + sed -i '/^Begin Queue$/,/^End Queue$/{/QUEUE_NAME/{N;s/\(QUEUE_NAME\s*=[^\n]*\)\n/\1\nRC_HOSTS = all\n/}}' "{{ LSF_LSBATCH_CONF }}/lsb.queues" + touch /tmp/rc_hosts_added + when: not rc_hosts_marker.stat.exists + run_once: true + +- name: Management Config | Append management hostnames to lsb.hosts + vars: + management_hostnames: "{{ lsf_masters_list.split() }}" + lineinfile: + path: "{{ LSF_LSBATCH_CONF }}/lsb.hosts" + insertafter: "^default !.*" + line: "{{ item }} 0 () () () () () (Y)" + state: present + loop: "{{ lsf_masters }}" + run_once: true + +- name: Management Config | Check if LSF_HOST_ADDR_RANGE is already set + shell: "grep -q '# ANSIBLE MANAGED: LSF_HOST_ADDR_RANGE added' '{{ LSF_CONF_FILE_PATH }}/lsf.cluster.{{ prefix }}'" + register: lsf_host_addr_range_marker_check + changed_when: false + failed_when: false + run_once: true + +- name: Management Config | Append LSF_HOST_ADDR_RANGE to lsf.cluster + blockinfile: + path: "{{ LSF_CONF_FILE_PATH }}/lsf.cluster.{{ prefix }}" + block: | + # ANSIBLE MANAGED: LSF_HOST_ADDR_RANGE added + Begin Parameters + LSF_HOST_ADDR_RANGE=10.*.*.* + End Parameters + marker: "" + when: lsf_host_addr_range_marker_check.rc != 0 + run_once: true + +- name: Management Config | Remove line containing 'lsfservers' + ansible.builtin.lineinfile: + path: "{{ LSF_CLUSTER_FILE }}" + regexp: '^lsfservers' + state: absent + run_once: true + +# Temporary: Remove after new image build includes cleanup +- name: Temporary Cleanup | Delete all 'sagar-fp-15-new1' folders + ansible.builtin.shell: | + find "{{ LSF_EXT_CONF }}" -type d -name "sagar-fp-15-new1" -exec rm -rf {} + + args: + warn: false + ignore_errors: true + when: inventory_hostname in groups['management_nodes'] + +# Temporary: Remove after new image build includes cleanup +- name: Temporary Cleanup | Replace 'sagar-fp-15-new1' with 'lsfservers' + ansible.builtin.shell: | + grep -rl 'sagar-fp-15-new1' "{{ LSF_EXT_CONF }}" | xargs sed -i 's/sagar-fp-15-new1/lsfservers/g' || true + args: + warn: false + when: inventory_hostname in groups['management_nodes'] diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/tasks/hosts_file_update.yml b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/hosts_file_update.yml new file mode 100644 index 00000000..ca3c63fa --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/hosts_file_update.yml @@ -0,0 +1,75 @@ +--- + +# Update the /etc/hosts file for Dynamic node Support + +- name: Management Config | Check if IP-to-host mapping already exists + shell: "grep -q '# ANSIBLE MANAGED: IP mapping added' '{{ LSF_HOSTS_FILE }}'" + register: ip_mapping_check + changed_when: false + failed_when: false + run_once: true + +- name: Management Config | Generate and append IP-to-host mapping to LSF hosts file + shell: | + echo "# ANSIBLE MANAGED: IP mapping added" >> '{{ LSF_HOSTS_FILE }}' + python3 -c "import ipaddress; \ + print('\\n'.join([str(ip) + ' {{ prefix }}-' + str(ip).replace('.', '-') \ + for ip in ipaddress.IPv4Network('{{ compute_subnets_cidr | first }}')]))" >> '{{ LSF_HOSTS_FILE }}' + args: + executable: /bin/bash + run_once: true + when: ip_mapping_check.rc != 0 + +- name: Management Config | Get IP addresses using getent + shell: "getent hosts {{ inventory_hostname }} | awk '{ print $1 }'" + register: dns_ip + changed_when: false + +- name: Management Config | Store IPs for 
each host + set_fact: + host_ip: "{{ dns_ip.stdout }}" + +- name: Management Config | Aggregate all IPs from all hosts + set_fact: + all_ips: "{{ groups['mgmt_compute_nodes'] | map('extract', hostvars, 'host_ip') | list }}" + run_once: true + +- name: Management Config | Display all resolved IP addresses + debug: + msg: "Resolved IPs: {{ all_ips }}" + run_once: true + +- name: Management Config | Check if each IP exists in LSF hosts file + shell: "grep -w '{{ item }}' {{ LSF_HOSTS_FILE }} || true" + register: ip_check + loop: "{{ all_ips }}" + changed_when: false + run_once: true + +- name: Management Config | Remove matched IPs from LSF hosts file if they exist + lineinfile: + path: "{{ LSF_HOSTS_FILE }}" + state: absent + regexp: "^{{ item.item }}\\s" + loop: "{{ ip_check.results }}" + when: item.stdout | length > 0 + run_once: true + +- name: Management Config | Ensure LSF hosts file exists + stat: + path: "{{ LSF_HOSTS_FILE }}" + register: lsf_hosts_stat + +- name: Management Config | Restore LSF hosts file if missing + copy: + src: "{{ HA_shared_dir }}/lsf/conf/hosts" + dest: "{{ LSF_HOSTS_FILE }}" + remote_src: yes + when: not lsf_hosts_stat.stat.exists + +- name: Management Config | Copy the Hosts file to /etc/hosts + copy: + src: "{{ LSF_HOSTS_FILE }}" + dest: /etc/hosts + remote_src: yes + ignore_errors: yes diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/tasks/hyperthreading.yml b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/hyperthreading.yml new file mode 100644 index 00000000..87c96cd8 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/hyperthreading.yml @@ -0,0 +1,21 @@ +--- +# Hyperthreading Configuration + +- name: Hyperthreading | Define ego_define_ncpus based on hyperthreading + set_fact: + ego_define_ncpus: "{{ 'threads' if enable_hyperthreading else 'cores' }}" + run_once: true + +- name: Hyperthreading | Print the value of ego_define_ncpus + debug: + msg: "EGO_DEFINE_NCPUS is set to {{ ego_define_ncpus }}" + run_once: true + +- name: Hyperthreading | Set the EGO_DEFINE_NCPUS in LSF config file + lineinfile: + path: "{{ LSF_CONF_FILE }}" + line: "{{ item }}" + create: yes + loop: + - "EGO_DEFINE_NCPUS={{ ego_define_ncpus }}" + run_once: true diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/tasks/main.yml b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/main.yml new file mode 100644 index 00000000..1f745d31 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/tasks/main.yml @@ -0,0 +1,11 @@ +--- + +- import_tasks: hyperthreading.yml + +- import_tasks: configure_management_nodes.yml + +- import_tasks: hosts_file_update.yml + +- import_tasks: app_center_configure.yml + +- import_tasks: configure_dynamic_nodes_templates.yml diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/templates/credentials.j2 b/modules/ansible-roles/roles/lsf_mgmt_config/templates/credentials.j2 new file mode 100644 index 00000000..6127aeb1 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/templates/credentials.j2 @@ -0,0 +1,8 @@ +# BEGIN ANSIBLE MANAGED BLOCK +VPC_URL=http://vpc.cloud.ibm.com/v1 +VPC_AUTH_TYPE=iam +VPC_APIKEY={{ ibmcloud_api_key }} +RESOURCE_RECORDS_URL=https://api.dns-svcs.cloud.ibm.com/v1 +RESOURCE_RECORDS_AUTH_TYPE=iam +RESOURCE_RECORDS_APIKEY={{ ibmcloud_api_key }} +# END ANSIBLE MANAGED BLOCK diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/templates/hostProviders.json.j2 b/modules/ansible-roles/roles/lsf_mgmt_config/templates/hostProviders.json.j2 new file mode 100644 index 00000000..abf286a1 --- 
/dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/templates/hostProviders.json.j2 @@ -0,0 +1,10 @@ +{ + "providers":[ + { + "name": "ibmcloudgen2", + "type": "ibmcloudgen2Prov", + "confPath": "resource_connector/ibmcloudgen2", + "scriptPath": "resource_connector/ibmcloudgen2" + } + ] +} diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/templates/ibmcloudgen2_config.json.j2 b/modules/ansible-roles/roles/lsf_mgmt_config/templates/ibmcloudgen2_config.json.j2 new file mode 100644 index 00000000..d5b34149 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/templates/ibmcloudgen2_config.json.j2 @@ -0,0 +1,17 @@ +{ + "IBMCLOUDGEN2_KEY_FILE": "{{ LSF_RC_IC_CONF }}/credentials", + "IBMCLOUDGEN2_PROVISION_FILE": "{{ LSF_RC_IC_CONF }}/user_data.sh", + "IBMCLOUDGEN2_MACHINE_PREFIX": "{{ prefix }}", + "LogLevel": "INFO", + "ApiEndPoints": { + "eu-gb": "https://eu-gb.iaas.cloud.ibm.com/v1", + "au-syd": "https://au-syd.iaas.cloud.ibm.com/v1", + "ca-tor": "https://ca-tor.iaas.cloud.ibm.com/v1", + "jp-osa": "https://jp-osa.iaas.cloud.ibm.com/v1", + "jp-tok": "https://jp-tok.iaas.cloud.ibm.com/v1", + "br-sao": "https://br-sao.iaas.cloud.ibm.com/v1", + "us-south": "https://us-south.iaas.cloud.ibm.com/v1", + "eu-de": "https://eu-de.iaas.cloud.ibm.com/v1", + "us-east": "https://us-east.iaas.cloud.ibm.com/v1" + } +} diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/templates/ibmcloudgen2_templates.json.j2 b/modules/ansible-roles/roles/lsf_mgmt_config/templates/ibmcloudgen2_templates.json.j2 new file mode 100644 index 00000000..358f5b11 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/templates/ibmcloudgen2_templates.json.j2 @@ -0,0 +1,25 @@ +{ + "templates": [ + { + "templateId": "Template-1", + "maxNumber": "{{ rc_max_num }}", + "attributes": { + "type": ["String", "X86_64"], + "ncores": ["Numeric", "{{ rc_ncores }}"], + "ncpus": ["Numeric", "{{ rc_ncpus }}"], + "mem": ["Numeric", "{{ rc_mem_in_mb }}"], + "icgen2host": ["Boolean", "1"] + }, + "crn": "{{ boot_volume_encryption_key }}", + "imageId": "{{ image_id }}", + "subnetId": "{{ compute_subnet_crn }}", + "vpcId": "{{ vpc_id }}", + "vmType": "{{ rc_profile }}", + "securityGroupIds": ["{{ compute_security_group_id | first }}"], + "resourceGroupId": "{{ resource_group_id }}", + "sshkey_id": "{{ compute_ssh_keys_ids | first }}", + "region": "{{ region_name }}", + "zone": "{{ zone_name | first }}" + } + ] +} diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/templates/user_data.sh b/modules/ansible-roles/roles/lsf_mgmt_config/templates/user_data.sh new file mode 100644 index 00000000..4d5336e1 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/templates/user_data.sh @@ -0,0 +1,420 @@ +#!/bin/bash + +logfile="/tmp/user_data.log" +echo "START $(date '+%Y-%m-%d %H:%M:%S')" >>$logfile + +# Initialize variables +cluster_prefix="{{ prefix }}" +default_cluster_name="myCluster" +nfs_server_with_mount_path="{{ mount_paths_map['/mnt/lsf'] }}" +cloud_monitoring_access_key="{{ cloud_monitoring_access_key }}" +cloud_monitoring_ingestion_url="{{ cloud_monitoring_ingestion_url }}" +observability_monitoring_on_compute_nodes_enable="{{ monitoring_enable_for_compute }}" +observability_logs_enable_for_compute="{{ logs_enable_for_compute }}" +cloud_logs_ingress_private_endpoint="{{ cloud_logs_ingress_private_endpoint }}" +VPC_APIKEY_VALUE="{{ ibmcloud_api_key }}" +custom_file_shares="{% for key, value in mount_paths_map.items() if key != '/mnt/lsf' %}{{ value }}{% if not loop.last %} {% endif %}{% endfor %}" 
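+# Note: custom_file_shares and custom_mount_paths are parallel, space-separated lists
+# rendered from mount_paths_map (every entry except /mnt/lsf). They are split with
+# read -ra further below and matched by index, so both Jinja loops must keep the same order.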
+custom_mount_paths="{% for key in mount_paths_map.keys() if key != '/mnt/lsf' %}{{ key }}{% if not loop.last %} {% endif %}{% endfor %}" +hyperthreading="{{ enable_hyperthreading }}" +ManagementHostNames="{{ lsf_masters | join(' ') }}" +dns_domain="{{ dns_domain_names }}" +network_interface="eth0" + +# LDAP +enable_ldap="{{ enable_ldap }}" +ldap_server="{{ ldap_server }}" +ldap_basedns="{{ ldap_basedns }}" + +# Setup Hostname +HostIP=$(hostname -I | awk '{print $1}') +hostname=${cluster_prefix}-${HostIP//./-} +hostnamectl set-hostname "${hostname}" + +# Setup vpcuser to login +if grep -E -q "CentOS|Red Hat" /etc/os-release; then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release; then + USER=ubuntu +fi +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys + +# Make lsfadmin and vpcuser set to newer expire +chage -I -1 -m 0 -M 99999 -E -1 -W 14 "${USER}" +chage -I -1 -m 0 -M 99999 -E -1 -W 14 lsfadmin + +# Setup Network configuration +if grep -q "NAME=\"Red Hat Enterprise Linux" /etc/os-release; then + echo "MTU=9000" >>"/etc/sysconfig/network-scripts/ifcfg-${network_interface}" + echo "DOMAIN=${dns_domain}" >>"/etc/sysconfig/network-scripts/ifcfg-${network_interface}" + gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) + cidr_range=$(ip route show | grep "kernel" | awk '{print $1}' | head -n 1) + echo "$cidr_range via $gateway_ip dev ${network_interface} metric 0 mtu 9000" >>/etc/sysconfig/network-scripts/route-${network_interface} + systemctl restart NetworkManager +fi + +# Function to attempt NFS mount with retries +mount_nfs_with_retries() { + local server_path=$1 + local client_path=$2 + local retries=5 + local success=false + + rm -rf "${client_path}" + mkdir -p "${client_path}" + + for ((j = 0; j < retries; j++)); do + mount -t nfs -o sec=sys "$server_path" "$client_path" -v >>$logfile + if mount | grep -q "${client_path}"; then + echo "Mount successful for ${server_path} on ${client_path}" >>$logfile + success=true + break + else + echo "Attempt $((j + 1)) of $retries failed for ${server_path} on ${client_path}" >>$logfile + sleep 2 + fi + done + + if [ "$success" = true ]; then + chmod 777 "${client_path}" + echo "${server_path} ${client_path} nfs rw,sec=sys,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,_netdev 0 0" >>/etc/fstab + else + echo "Mount not found for ${server_path} on ${client_path} after $retries attempts." >>$logfile + rm -rf "${client_path}" + fi +} + +# Setup LSF share +if [ -n "${nfs_server_with_mount_path}" ]; then + echo "File share ${nfs_server_with_mount_path} found" >>$logfile + nfs_client_mount_path="/mnt/lsf" + if mount_nfs_with_retries "${nfs_server_with_mount_path}" "${nfs_client_mount_path}"; then + echo "Mount completed successfully with ${nfs_client_mount_path}" >>$logfile + else + echo "Mount not found for ${nfs_server_with_mount_path}, Exiting !!" >>$logfile + exit 1 + fi +fi +echo "Setting LSF share is completed." >>$logfile + +echo '{% raw %}' +# Setup Custom file shares +echo "Setting custom file shares." 
>>"$logfile" +if [ -n "${custom_file_shares}" ]; then + echo "Custom file share ${custom_file_shares} found" >>"$logfile" + read -ra file_share_array <<<"${custom_file_shares}" + read -ra mount_path_array <<<"${custom_mount_paths}" + length=${#file_share_array[@]} + + for ((i = 0; i < length; i++)); do + mount_nfs_with_retries "${file_share_array[$i]}" "${mount_path_array[$i]}" + done +fi +echo "Setting custom file shares is completed." >>"$logfile" +echo '{% endraw %}' + +# Setup SSH +LDAP_DIR="/home/lsfadmin" +SSH_DIR="$LDAP_DIR/.ssh" +mkdir -p "$SSH_DIR" +cp /home/vpcuser/.ssh/authorized_keys "$SSH_DIR/authorized_keys" +cat "{{ ha_shared_dir }}/ssh/id_rsa.pub" >>"$SSH_DIR/authorized_keys" +cp "{{ ha_shared_dir }}/ssh/id_rsa" "$SSH_DIR/id_rsa" +echo "StrictHostKeyChecking no" >>"$SSH_DIR/config" +chmod 600 "$SSH_DIR/authorized_keys" +chmod 400 "$SSH_DIR/id_rsa" +chmod 700 "$SSH_DIR" +chown -R lsfadmin:lsfadmin "$SSH_DIR" + +# Setup LSF environment variables +LSF_TOP="/opt/ibm/lsfsuite/lsf" +LSF_CONF="$LSF_TOP/conf" +LSF_WORK="$LSF_TOP/work" +LSF_CONF_FILE="$LSF_CONF/lsf.conf" +LSF_LOGS="/opt/ibm/lsflogs" +SHARED_HOSTS="/mnt/lsf/lsf/conf/hosts" +LSF_HOSTS_FILE="${LSF_CONF}/hosts" +SYSTEM_HOSTS_FILE="/etc/hosts" + +# Create a logs folder +mkdir -p $LSF_LOGS +chown -R lsfadmin $LSF_LOGS +chown -R 755 $LSF_LOGS + +# Append the line only if the exact search line is not already present +if ! grep -Fxq "search ${dns_domain}" /etc/resolv.conf; then + echo "search ${dns_domain}" >>/etc/resolv.conf + echo "Appended DNS entry: search ${dns_domain}" >>"$logfile" +else + echo "DNS entry 'search ${dns_domain}' is already present." >>"$logfile" +fi + +# Check if source file exists +if [[ -f "$SHARED_HOSTS" ]]; then + cp -p "$SHARED_HOSTS" "$LSF_HOSTS_FILE" + cp -p "$SHARED_HOSTS" "$SYSTEM_HOSTS_FILE" +else + echo "Error: Source file '$SHARED_HOSTS' does not exist." 
>&2 >>"$logfile" + exit 1 +fi + +# Apply system tuning parameters +LSF_TUNABLES="/etc/sysctl.conf" +{ + echo 'vm.overcommit_memory=1' + echo 'net.core.rmem_max=26214400' + echo 'net.core.rmem_default=26214400' + echo 'net.core.wmem_max=26214400' + echo 'net.core.wmem_default=26214400' + echo 'net.ipv4.tcp_fin_timeout = 5' + echo 'net.core.somaxconn = 8000' +} >>"$LSF_TUNABLES" +sudo sysctl -p $LSF_TUNABLES + +# Defining ncpus based on hyper-threading +if [ "$hyperthreading" == "True" ]; then + ego_define_ncpus="threads" +else + ego_define_ncpus="cores" + cat <<'EOT' >/root/lsf_hyperthreading +#!/bin/sh +for vcpu in $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | cut -s -d- -f2 | cut -d- -f2 | uniq); do + echo "0" > "/sys/devices/system/cpu/cpu"$vcpu"/online" +done +EOT + chmod 755 /root/lsf_hyperthreading + command="/root/lsf_hyperthreading" + sh $command && ( + crontab -l 2>/dev/null + echo "@reboot $command" + ) | crontab - +fi +echo "EGO_DEFINE_NCPUS=${ego_define_ncpus}" >>$LSF_CONF_FILE + +# Main Configuration for Dynamic Nodes +sed -i 's|^LSF_LOGDIR=.*|LSF_LOGDIR="/opt/ibm/lsflogs"|' $LSF_CONF_FILE +sed -i '/^lsfservers/d' "$LSF_CONF/lsf.cluster.$cluster_prefix" +grep -rli "$default_cluster_name" $LSF_CONF/* | xargs sed -i "s/$default_cluster_name/$cluster_prefix/g" +mv $LSF_WORK/$default_cluster_name $LSF_WORK/"$cluster_prefix" +find "$LSF_TOP" -name "*$default_cluster_name*" -print0 | while IFS= read -r -d '' file; do + new_file=$(echo "$file" | sed -r "s/$default_cluster_name/$cluster_prefix/g") + mv "$file" "$new_file" +done +grep -rli 'lsfservers' $LSF_CONF/* | xargs sed -i "s/lsfservers/${ManagementHostNames}/g" + +cat <<EOF >>$LSF_CONF_FILE +LSF_SERVER_HOSTS="${ManagementHostNames}" +LSF_ADDON_HOSTS="$(echo "$ManagementHostNames" | awk '{print $1}')" +LSF_GET_CONF=lim +LSF_GPU_AUTOCONFIG=Y +LSB_GPU_NEW_SYNTAX=extend +EOF + +# source profile.lsf +echo "source ${LSF_CONF}/profile.lsf" >>~/.bashrc +echo "source ${LSF_CONF}/profile.lsf" >>"$LDAP_DIR"/.bashrc +source "$HOME/.bashrc" +source "$LDAP_DIR/.bashrc" + +chown -R lsfadmin $LSF_TOP +chown -R lsfadmin $LSF_WORK + +# Restart the lsfd service +service lsfd stop && sleep 2 && service lsfd start +sleep 10 + +# Setting up the LDAP configuration +if [ "$enable_ldap" = "true" ]; then + + # Detect if the operating system is RHEL or Rocky Linux + if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release || grep -q "NAME=\"Rocky Linux\"" /etc/os-release; then + + # Detect RHEL or Rocky version + version=$(grep -oE 'release [0-9]+' /etc/redhat-release | awk '{print $2}') + + # Proceed if the detected version is either 8 or 9 + if [ "$version" == "8" ] || [ "$version" == "9" ]; then + echo "Detected as RHEL or Rocky $version. Proceeding with LDAP client configuration..."
>>$logfile + + # Enable password authentication for SSH by modifying the configuration file + sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config + systemctl restart sshd + + # Check if the SSL certificate file exists, then copy it to the correct location + # Retry finding SSL certificate with a maximum of 5 attempts and 5 seconds sleep between retries + for attempt in {1..5}; do + if [ -f "{{ ha_shared_dir }}/openldap/ldap_cacert.pem" ]; then + echo "LDAP SSL cert found under {{ ha_shared_dir }}/openldap/ldap_cacert.pem path" >>$logfile + mkdir -p /etc/openldap/certs/ + cp -pr "{{ ha_shared_dir }}/openldap/ldap_cacert.pem" "/etc/openldap/certs/ldap_cacert.pem" + break + else + echo "SSL cert not found on attempt $attempt. Retrying in 5 seconds..." >>$logfile + sleep 5 + fi + done + # Exit if the SSL certificate is still not found after 5 attempts + [ -f "{{ ha_shared_dir }}/openldap/ldap_cacert.pem" ] || { + echo "SSL cert not found after 5 attempts. Exiting." >>$logfile + exit 1 + } + + # Create and configure the SSSD configuration file for LDAP integration + cat <<EOF >/etc/sssd/sssd.conf +[sssd] +config_file_version = 2 +services = nss, pam, autofs +domains = default + +[nss] +homedir_substring = /home + +[pam] + +[domain/default] +id_provider = ldap +autofs_provider = ldap +auth_provider = ldap +chpass_provider = ldap +ldap_uri = ldap://${ldap_server} +ldap_search_base = dc=${ldap_basedns%%.*},dc=${ldap_basedns#*.} +ldap_id_use_start_tls = True +ldap_tls_cacertdir = /etc/openldap/certs +cache_credentials = True +ldap_tls_reqcert = allow +EOF + + # Secure the SSSD configuration file by setting appropriate permissions + chmod 600 /etc/sssd/sssd.conf + chown root:root /etc/sssd/sssd.conf + + # Create and configure the OpenLDAP configuration file for TLS + cat <<EOF >/etc/openldap/ldap.conf +BASE dc=${ldap_basedns%%.*},dc=${ldap_basedns#*.} +URI ldap://${ldap_server} +TLS_CACERT /etc/openldap/certs/ldap_cacert.pem +TLS_CACERTDIR /etc/openldap/certs +EOF + + # Rehash certificates in the OpenLDAP directory to ensure proper recognition + openssl rehash /etc/openldap/certs + + # Apply the SSSD and home directory creation configuration using authselect + authselect select sssd with-mkhomedir --force + + # Enable and start the SSSD and oddjobd services for user authentication and home directory management + systemctl enable --now sssd oddjobd + + # Restart both services to apply the configuration + systemctl restart sssd oddjobd + + # Validate the LDAP configuration by performing a test search using ldapsearch + if ldapsearch -x -H ldap://"${ldap_server}"/ -b "dc=${ldap_basedns%%.*},dc=${ldap_basedns#*.}" >/dev/null; then + echo "LDAP configuration completed successfully!" >>$logfile + else + echo "LDAP configuration failed! Exiting." >>$logfile + exit 1 + fi + + # Ensure LSF commands are available to all users by adding the profile to bashrc + echo ". ${LSF_CONF}/profile.lsf" >>/etc/bashrc + source /etc/bashrc + + else + echo "This script is intended for RHEL and Rocky Linux 8 or 9. Detected version: $version. Exiting." >>$logfile + exit 1 + fi + fi +else + echo "Skipping LDAP Client configuration as it is not enabled." 
>>$logfile +fi + +# Setting up the Cloud Monitoring Agent +if [ "$cloud_monitoring_access_key" != "" ] && [ "$cloud_monitoring_ingestion_url" != "" ]; then + + SYSDIG_CONFIG_FILE="/opt/draios/etc/dragent.yaml" + + #packages installation + echo "Writing sysdig config file" >>"$logfile" + + #sysdig config file + echo "Setting customerid access key" >>"$logfile" + sed -i "s/==ACCESSKEY==/$cloud_monitoring_access_key/g" $SYSDIG_CONFIG_FILE + sed -i "s/==COLLECTOR==/$cloud_monitoring_ingestion_url/g" $SYSDIG_CONFIG_FILE + echo "tags: type:compute,lsf:true" >>$SYSDIG_CONFIG_FILE +else + echo "Skipping metrics agent configuration due to missing parameters" >>"$logfile" +fi + +if [ "$observability_monitoring_on_compute_nodes_enable" = true ]; then + + echo "Restarting sysdig agent" >>"$logfile" + systemctl enable dragent + systemctl restart dragent +else + echo "Metrics agent start skipped since monitoring provisioning is not enabled" >>"$logfile" +fi + +# Setting up the IBM Cloud Logs +if [ "$observability_logs_enable_for_compute" = true ]; then + + echo "Configuring cloud logs for compute since observability logs for compute is enabled" + sudo cp /root/post-config.sh /opt/ibm + cd /opt/ibm || exit + + cat <<EOL >/etc/fluent-bit/fluent-bit.conf +[SERVICE] + Flush 1 + Log_Level info + Daemon off + Parsers_File parsers.conf + Plugins_File plugins.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + HTTP_Port 9001 + Health_Check On + HC_Errors_Count 1 + HC_Retry_Failure_Count 1 + HC_Period 30 + storage.path /fluent-bit/cache + storage.max_chunks_up 192 + storage.metrics On + +[INPUT] + Name syslog + Path /tmp/in_syslog + Buffer_Chunk_Size 32000 + Buffer_Max_Size 64000 + Receive_Buffer_Size 512000 + +[INPUT] + Name tail + Tag * + Path /opt/ibm/lsflogs/*.log.* + Path_Key file + Exclude_Path /var/log/at/** + DB /opt/ibm/lsflogs/fluent-bit.DB + Buffer_Chunk_Size 32KB + Buffer_Max_Size 256KB + Skip_Long_Lines On + Refresh_Interval 10 + storage.type filesystem + storage.pause_on_chunks_overlimit on + +[FILTER] + Name modify + Match * + Add subsystemName compute + Add applicationName lsf + +@INCLUDE output-logs-router-agent.conf +EOL + + sudo chmod +x post-config.sh + sudo ./post-config.sh -h "$cloud_logs_ingress_private_endpoint" -p "3443" -t "/logs/v1/singles" -a IAMAPIKey -k "$VPC_APIKEY_VALUE" --send-directly-to-icl -s true -i Production + echo "INFO Testing IBM Cloud LSF Logs from compute: $hostname" | sudo tee -a /opt/ibm/lsflogs/test.log.com >/dev/null + sudo logger -u /tmp/in_syslog my_ident my_syslog_test_message_from_compute:"$hostname" +else + echo "Cloud Logs configuration skipped since observability logs for compute is not enabled" +fi + +echo "COMPLETED $(date '+%Y-%m-%d %H:%M:%S')" >>$logfile diff --git a/modules/ansible-roles/roles/lsf_mgmt_config/vars/main.yml b/modules/ansible-roles/roles/lsf_mgmt_config/vars/main.yml new file mode 100644 index 00000000..d0d17636 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_mgmt_config/vars/main.yml @@ -0,0 +1,9 @@ +LSF_CONF_FILE_PATH: "/opt/ibm/lsfsuite/lsf/conf" +LSF_CONF_FILE: "{{ LSF_CONF_FILE_PATH }}/lsf.conf" +LSF_RC_IC_CONF: "{{ LSF_CONF_FILE_PATH }}/resource_connector/ibmcloudgen2/conf" +LSF_LSBATCH_CONF: "{{ LSF_CONF_FILE_PATH }}/lsbatch/{{ prefix }}/configdir" +LSF_HOSTS_FILE: "{{ LSF_CONF_FILE_PATH }}/hosts" +LSF_EGO_CONF_FILE: "{{ LSF_CONF_FILE_PATH }}/ego/{{ prefix }}/kernel/ego.conf" +LSF_CLUSTER_FILE: "{{ LSF_CONF_FILE_PATH }}/lsf.cluster.{{ prefix }}" +LSF_EXT_CONF: "/opt/ibm/lsfsuite/ext" +JS_PAC_SERVER_URL: "{{ LSF_EXT_CONF }}/ppm/conf/js.conf" 
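The variables above are path constants for the lsf_mgmt_config role; the task files that consume them are not included in this hunk. As a rough sketch of how such a task might reference them, the snippet below is illustrative only: the lineinfile usage is an assumption, while the variable name, the lsf.conf key and value, and the management_nodes group all appear elsewhere in this changeset.

- name: lsf_mgmt_config | Illustrative sketch - ensure a key is present in lsf.conf
  ansible.builtin.lineinfile:
    path: "{{ LSF_CONF_FILE }}"  # resolves to /opt/ibm/lsfsuite/lsf/conf/lsf.conf per the vars above
    regexp: "^LSF_GPU_AUTOCONFIG="
    line: "LSF_GPU_AUTOCONFIG=Y"
  when: inventory_hostname in groups['management_nodes']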
diff --git a/modules/ansible-roles/roles/lsf_post_config/tasks/cluster_validation.yml b/modules/ansible-roles/roles/lsf_post_config/tasks/cluster_validation.yml new file mode 100644 index 00000000..f16b9361 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_post_config/tasks/cluster_validation.yml @@ -0,0 +1,43 @@ +--- +- name: MTU Check | Restart NetworkManager if MTU 9000 is not configured + ansible.builtin.shell: | + ip route show | grep -q 'mtu 9000' || { + systemctl restart NetworkManager + echo "restarted" + } + register: mtu_check_result + changed_when: "'restarted' in mtu_check_result.stdout" + when: inventory_hostname in groups['mgmt_compute_nodes'] + +- name: LSF Version | Retrieve lsid output + ansible.builtin.shell: lsid + register: lsid_output + changed_when: false + when: inventory_hostname == groups['management_nodes'][0] + +- name: LSF Version | Display lsid output + ansible.builtin.debug: + msg: "{{ lsid_output.stdout }}" + when: inventory_hostname == groups['management_nodes'][0] + +- name: Cluster Status | Fetch node status using bhosts + ansible.builtin.shell: bhosts -w + register: cluster_status_output + changed_when: false + when: inventory_hostname == groups['management_nodes'][0] + +- name: Cluster Status | Show node status + ansible.builtin.debug: + msg: "{{ cluster_status_output.stdout }}" + when: inventory_hostname == groups['management_nodes'][0] + +- name: Cluster Health | Restart lsfd if any node is unreach or unavail + ansible.builtin.shell: | + if bhosts -w | grep -Eq 'unreach|unavail'; then + systemctl restart lsfd + sleep 5 + echo "lsfd restarted" + fi + register: lsfd_restart_result + changed_when: "'lsfd restarted' in lsfd_restart_result.stdout" + when: inventory_hostname == groups['management_nodes'][0] diff --git a/modules/ansible-roles/roles/lsf_post_config/tasks/configure_shared_folders.yml b/modules/ansible-roles/roles/lsf_post_config/tasks/configure_shared_folders.yml new file mode 100644 index 00000000..fde333e0 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_post_config/tasks/configure_shared_folders.yml @@ -0,0 +1,116 @@ +--- + +# - name: Log directories | Remove duplicate logs +# ansible.builtin.shell: > +# find /opt/ibm/lsflogs -type f ! -name "*.{{ dns_domain_names }}" ! 
-name "ibmcloudgen2*" -delete +# become: true +# when: inventory_hostname in groups['management_nodes'] + +- name: Log directories | Setup shared base directories + file: + path: "{{ item.path }}" + state: directory + mode: "{{ item.mode | default('0755') }}" + recurse: yes + owner: lsfadmin + group: root + loop: + - { path: "{{ SHARED_PATH }}/data", mode: '0755' } + - { path: "{{ SHARED_PATH }}/logs", mode: '0755' } + - { path: "{{ SHARED_PATH }}/repository-path", mode: '0755' } + - { path: "{{ SHARED_PATH }}/das_staging_area", mode: '0755' } + when: inventory_hostname == groups['management_nodes'][0] + +- name: Log directories | Create per-host log directory under shared path + file: + path: "{{ SHARED_PATH }}/logs/{{ inventory_hostname }}" + state: directory + mode: '0777' + owner: lsfadmin + group: root + when: inventory_hostname in groups['management_nodes'] + +- name: Log directories | Move existing logs to shared per-host directory + shell: | + mv {{ LSF_LOGS }}/* {{ SHARED_PATH }}/logs/{{ inventory_hostname }}/ 2>/dev/null || true + args: + warn: false + when: inventory_hostname in groups['management_nodes'] + +- name: Log directories | Remove original LSF logs directory if it exists + file: + path: "{{ LSF_LOGS }}" + state: absent + ignore_errors: true + when: inventory_hostname in groups['management_nodes'] + +- name: Log directories | Create symlink from shared per-host logs to LSF log path + file: + src: "{{ SHARED_PATH }}/logs/{{ inventory_hostname }}" + dest: "{{ LSF_LOGS }}" + state: link + force: true + owner: lsfadmin + group: root + when: inventory_hostname in groups['management_nodes'] + +- name: Log directories | Ensure correct permissions on per-host logs directory + file: + path: "{{ SHARED_PATH }}/logs/{{ inventory_hostname }}" + state: directory + mode: '0777' + recurse: yes + owner: lsfadmin + group: root + when: inventory_hostname in groups['management_nodes'] + +- name: Log directories | Create symlink to shared data directory + file: + src: "{{ SHARED_PATH }}/data" + dest: "{{ LSF_TOP }}/work/data" + state: link + force: yes + when: inventory_hostname == groups['management_nodes'][0] + +- name: Log directories | Ensure ownership of shared data directory + file: + path: "{{ SHARED_PATH }}/data" + state: directory + recurse: yes + owner: lsfadmin + group: root + when: inventory_hostname == groups['management_nodes'][0] + +- name: Log directories | Ensure symlink for das_staging_area under LSF_TOP + shell: | + [ -L "{{ LSF_TOP }}/{{ item }}" ] && echo "Symlink exists, skipping." 
|| { \ + [ -d "{{ LSF_TOP }}/{{ item }}" ] && rm -rf "{{ LSF_TOP }}/{{ item }}"; \ + ln -s "{{ SHARED_PATH }}/{{ item }}" "{{ LSF_TOP }}/{{ item }}"; } + loop: + - das_staging_area + when: inventory_hostname == groups['management_nodes'][0] + +- name: LoginNode host entry | Read LSF hosts file from shared path + slurp: + src: "{{ SHARED_PATH }}/lsf/conf/hosts" + register: lsf_hosts_file + when: inventory_hostname == groups['login_node'][0] + +- name: LoginNode host entry | Append LSF Login hosts to /etc/hosts + blockinfile: + path: /etc/hosts + create: yes + marker: "# {mark} LSF HOSTS BLOCK" + insertafter: EOF + block: | + {{ lsf_hosts_file.content | b64decode }} + become: yes + when: inventory_hostname == groups['login_node'][0] + +- name: LoginNode host entry | Change ownership of /etc/hosts + ansible.builtin.file: + path: /etc/hosts + owner: lsfadmin + group: root + become: yes + when: inventory_hostname == groups['login_node'][0] diff --git a/modules/ansible-roles/roles/lsf_post_config/tasks/main.yml b/modules/ansible-roles/roles/lsf_post_config/tasks/main.yml new file mode 100644 index 00000000..9ce4a08f --- /dev/null +++ b/modules/ansible-roles/roles/lsf_post_config/tasks/main.yml @@ -0,0 +1,13 @@ +--- + +# Set correct ownership and permissions on shared directories +- import_tasks: permissions_setup.yml + +# Configure shared folders and create necessary symlinks on management nodes +- import_tasks: configure_shared_folders.yml + +# Reload or restart services to apply the new configurations +- import_tasks: reload_services.yml + +# Cluster validation +- import_tasks: cluster_validation.yml diff --git a/modules/ansible-roles/roles/lsf_post_config/tasks/permissions_setup.yml b/modules/ansible-roles/roles/lsf_post_config/tasks/permissions_setup.yml new file mode 100644 index 00000000..0c3beebf --- /dev/null +++ b/modules/ansible-roles/roles/lsf_post_config/tasks/permissions_setup.yml @@ -0,0 +1,23 @@ +--- + +# Set recursive permissions for LSF Suite directory to 0755 +- name: Change permissions of lsfsuite to 0755 + ansible.builtin.command: "chmod -R 0755 {{ LSF_SUITE }}" + ignore_errors: true + +# Set ownership of LSF Suite directory to lsfadmin +- name: Change ownership of lsfsuite to lsfadmin + ansible.builtin.command: "chown -R lsfadmin {{ LSF_SUITE }}" + ignore_errors: true + +# Set recursive permissions for shared path +- name: Change permissions of {{ SHARED_PATH }} to 0755 + ansible.builtin.command: "chmod -R 0755 {{ SHARED_PATH }}" + when: inventory_hostname == groups['management_nodes'][0] + ignore_errors: true + +# Set ownership of shared path to lsfadmin +- name: Change ownership of {{ SHARED_PATH }} to lsfadmin + ansible.builtin.command: "chown -R lsfadmin {{ SHARED_PATH }}" + when: inventory_hostname == groups['management_nodes'][0] + ignore_errors: true diff --git a/modules/ansible-roles/roles/lsf_post_config/tasks/reload_services.yml b/modules/ansible-roles/roles/lsf_post_config/tasks/reload_services.yml new file mode 100644 index 00000000..53d5712e --- /dev/null +++ b/modules/ansible-roles/roles/lsf_post_config/tasks/reload_services.yml @@ -0,0 +1,14 @@ +--- + +# Restart LSF daemon (lsfd) service on the first management node +- name: Restart lsfd service + service: + name: lsfd + state: restarted + when: inventory_hostname == groups['management_nodes'][0] + +# Restart the NetworkManager service on all nodes +- name: Restart NetworkManager + service: + name: NetworkManager + state: restarted diff --git a/modules/ansible-roles/roles/lsf_post_config/vars/main.yml 
b/modules/ansible-roles/roles/lsf_post_config/vars/main.yml new file mode 100644 index 00000000..6e28bb54 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_post_config/vars/main.yml @@ -0,0 +1,6 @@ +--- + +SHARED_PATH: "/mnt/lsf" +LSF_SUITE: "/opt/ibm/lsfsuite" +LSF_TOP: "{{ LSF_SUITE }}/lsf" +LSF_LOGS: "/opt/ibm/lsflogs" diff --git a/modules/ansible-roles/roles/lsf_prereq_config/tasks/disable_ansible_repo.yml b/modules/ansible-roles/roles/lsf_prereq_config/tasks/disable_ansible_repo.yml new file mode 100644 index 00000000..2c87eaa3 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_prereq_config/tasks/disable_ansible_repo.yml @@ -0,0 +1,12 @@ +--- +# Disabling Ansible repo if it is enabled. + +- name: Ansible Repo | Check if the Ansible repo is listed + shell: yum repolist | grep -q '^ansible-2-for-rhel-8-x86_64-rpms' + register: ansible_repo_check + ignore_errors: true + changed_when: false + +- name: Ansible Repo | Disable Ansible repo + command: subscription-manager repos --disable=ansible-2-for-rhel-8-x86_64-rpms + when: ansible_repo_check.rc == 0 diff --git a/modules/ansible-roles/roles/lsf_prereq_config/tasks/hyperthreading.yml b/modules/ansible-roles/roles/lsf_prereq_config/tasks/hyperthreading.yml new file mode 100644 index 00000000..ad289c83 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_prereq_config/tasks/hyperthreading.yml @@ -0,0 +1,22 @@ +--- +- name: Hyperthreading | Create LSF hyperthreading script for disabling threads if hyperthreading is false + copy: + dest: "{{ hyperthreading_file }}" + content: | + #!/bin/sh + for vcpu in $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | cut -s -d- -f2 | cut -d- -f2 | uniq); do + echo "0" > "/sys/devices/system/cpu/cpu"$vcpu"/online" + done + mode: '0755' + when: not enable_hyperthreading + +- name: Hyperthreading | Run the hyperthreading script and add to cron if hyperthreading is false + shell: "{{ hyperthreading_file }}" + when: not enable_hyperthreading + +- name: Hyperthreading | Add script to cron for reboot if hyperthreading is false + cron: + name: "Disable Hyperthreading" + special_time: reboot + job: "{{ hyperthreading_file }}" + when: not enable_hyperthreading diff --git a/modules/ansible-roles/roles/lsf_prereq_config/tasks/lsf_tunables.yml b/modules/ansible-roles/roles/lsf_prereq_config/tasks/lsf_tunables.yml new file mode 100644 index 00000000..dbfd1c9a --- /dev/null +++ b/modules/ansible-roles/roles/lsf_prereq_config/tasks/lsf_tunables.yml @@ -0,0 +1,20 @@ +# Update the Network Tunables +--- +- name: LSF_Tunables | Ensure sysctl parameters are set + ansible.builtin.lineinfile: + path: "{{ sysctl_conf }}" + line: "{{ item }}" + create: yes + loop: + - "vm.overcommit_memory=1" + - "net.core.rmem_max=26214400" + - "net.core.rmem_default=26214400" + - "net.core.wmem_max=26214400" + - "net.core.wmem_default=26214400" + - "net.ipv4.tcp_fin_timeout=5" + - "net.core.somaxconn=8000" + +- name: LSF_Tunables | Apply sysctl settings + ansible.builtin.command: + cmd: sysctl -p "{{ sysctl_conf }}" + changed_when: false diff --git a/modules/ansible-roles/roles/lsf_prereq_config/tasks/lsfadmin_creation.yml b/modules/ansible-roles/roles/lsf_prereq_config/tasks/lsfadmin_creation.yml new file mode 100644 index 00000000..1db4d6c8 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_prereq_config/tasks/lsfadmin_creation.yml @@ -0,0 +1,25 @@ +--- +# Create and configure the lsfadmin user +- name: lsfadmin | Check if lsfadmin user exists + ansible.builtin.getent: + database: passwd #pragma: allowlist secret + key: 
lsfadmin + register: user_info + ignore_errors: true + +- name: lsfadmin | Create lsfadmin user if not present + ansible.builtin.user: + name: lsfadmin + comment: "LSF Admin User" + create_home: yes + home: /home/lsfadmin + shell: /bin/bash + uid: 1005 + when: user_info.failed + register: user_created + +- name: lsfadmin | Provide sudo access to lsfadmin only if new user is created + ansible.builtin.shell: "echo 'lsfadmin ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers" #pragma: allowlist secret + args: + executable: /bin/bash + when: user_created is changed diff --git a/modules/ansible-roles/roles/lsf_prereq_config/tasks/lsfadmin_password_less_auth.yml b/modules/ansible-roles/roles/lsf_prereq_config/tasks/lsfadmin_password_less_auth.yml new file mode 100644 index 00000000..8e0293eb --- /dev/null +++ b/modules/ansible-roles/roles/lsf_prereq_config/tasks/lsfadmin_password_less_auth.yml @@ -0,0 +1,86 @@ +--- +- name: Passwordless SSH | Create necessary directories + file: + path: "{{ item }}" + state: directory + mode: '0755' + loop: + - "{{ HA_shared_dir }}/ssh" + run_once: true + +- name: Passwordless SSH | Validate compute public and private key content + fail: + msg: "compute_public_key_content or compute_private_key_content is empty. Exiting." + when: (compute_public_key_content | length == 0) or (compute_private_key_content | length == 0) + +- name: Passwordless SSH | Decode and copy compute public and private key contents + shell: echo "{{ item.content }}" | base64 --decode > "{{ item.dest }}" + loop: + - { content: "{{ compute_public_key_content }}", dest: "{{ HA_shared_dir }}/ssh/id_rsa.pub" } + - { content: "{{ compute_private_key_content }}", dest: "{{ HA_shared_dir }}/ssh/id_rsa" } + no_log: true + run_once: true + +- name: Passwordless SSH | Create necessary directories + file: + path: "{{ item }}" + state: directory + mode: '0755' + loop: + - /home/lsfadmin + - "{{ lsfadmin_ssh_path }}" + +- name: Passwordless SSH | Set permissions for shared SSH directory + file: + path: "{{ HA_shared_dir }}/ssh" + state: directory + mode: '0777' + recurse: yes + +- name: Passwordless SSH | Copy authorized_keys to lsfadmin's .ssh + copy: + src: /home/vpcuser/.ssh/authorized_keys + dest: "{{ lsfadmin_ssh_path }}/authorized_keys" + remote_src: yes + owner: lsfadmin + group: lsfadmin + mode: '0600' + +- name: Passwordless SSH | Copy public key content to authorized_keys + command: "cat {{ HA_shared_dir }}/ssh/id_rsa.pub" + register: pub_key_content + changed_when: false + +- name: Passwordless SSH | Append public key to authorized_keys + lineinfile: + path: "{{ lsfadmin_ssh_path }}/authorized_keys" + line: "{{ pub_key_content.stdout }}" + owner: lsfadmin + group: lsfadmin + mode: '0600' + +- name: Passwordless SSH | Copy private key to lsfadmin's .ssh + copy: + src: "{{ HA_shared_dir }}/ssh/id_rsa" + dest: "{{ lsfadmin_ssh_path }}/id_rsa" + remote_src: yes + owner: lsfadmin + group: lsfadmin + mode: '0400' + +- name: Passwordless SSH | Disable StrictHostKeyChecking + lineinfile: + path: "{{ lsfadmin_ssh_path }}/config" + line: "StrictHostKeyChecking no" + create: yes + owner: lsfadmin + group: lsfadmin + mode: '0644' + +- name: Passwordless SSH | Ensure proper permissions on .ssh directory + file: + path: "{{ lsfadmin_ssh_path }}" + state: directory + owner: lsfadmin + group: lsfadmin + mode: '0700' diff --git a/modules/ansible-roles/roles/lsf_prereq_config/tasks/main.yml b/modules/ansible-roles/roles/lsf_prereq_config/tasks/main.yml new file mode 100644 index 00000000..9338ee90 --- /dev/null +++ 
b/modules/ansible-roles/roles/lsf_prereq_config/tasks/main.yml @@ -0,0 +1,19 @@ +--- + +# MTU Configuration +- import_tasks: mtu_configuration.yml + +# lsf_tunables configuration +- import_tasks: lsf_tunables.yml + +# Create lsfadmin user +- import_tasks: lsfadmin_creation.yml + +# Setup Password less authentication for lsfadmin user +- import_tasks: lsfadmin_password_less_auth.yml + +# Disable Hyperthreading +- import_tasks: hyperthreading.yml + +# Disable Ansible repo +- import_tasks: disable_ansible_repo.yml diff --git a/modules/ansible-roles/roles/lsf_prereq_config/tasks/mtu_configuration.yml b/modules/ansible-roles/roles/lsf_prereq_config/tasks/mtu_configuration.yml new file mode 100644 index 00000000..9e13ab78 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_prereq_config/tasks/mtu_configuration.yml @@ -0,0 +1,32 @@ +--- +- name: MTU | Check if MTU and Domain are already configured + ansible.builtin.shell: | + grep -q '^MTU=9000' {{ network_script_path }}/ifcfg-{{ network_interface }} && \ + grep -q '^DOMAIN={{ dns_domain_names }}' {{ network_script_path }}/ifcfg-{{ network_interface }} + register: mtu_check + failed_when: false + changed_when: false + ignore_errors: true + +- name: MTU | Set MTU and Domain in network script + ansible.builtin.blockinfile: + path: "{{ network_script_path }}/ifcfg-{{ network_interface }}" + block: | + MTU=9000 + DOMAIN={{ dns_domain_names }} + marker: "# {mark} ANSIBLE MANAGED BLOCK" + when: mtu_check.rc != 0 + +- name: MTU | Get default gateway and CIDR range + ansible.builtin.shell: | + gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) + cidr_range=$(ip route show | grep "kernel" | awk '{print $1}' | head -n 1) + echo "$cidr_range via $gateway_ip dev {{ network_interface }} metric 0 mtu 9000" + register: route + changed_when: false + +- name: MTU | Set MTU at the router level + ansible.builtin.lineinfile: + path: "{{ network_script_path }}/route-{{ network_interface }}" + line: "{{ route.stdout }}" + create: yes diff --git a/modules/ansible-roles/roles/lsf_prereq_config/vars/main.yml b/modules/ansible-roles/roles/lsf_prereq_config/vars/main.yml new file mode 100644 index 00000000..39a523b8 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_prereq_config/vars/main.yml @@ -0,0 +1,5 @@ +network_interface: "eth0" +network_script_path: "/etc/sysconfig/network-scripts" +sysctl_conf: "/etc/sysctl.conf" +lsfadmin_ssh_path: "/home/lsfadmin/.ssh" +hyperthreading_file: "/root/lsf_hyperthreading" diff --git a/modules/ansible-roles/roles/lsf_template_config/tasks/lsf_inventory.yml b/modules/ansible-roles/roles/lsf_template_config/tasks/lsf_inventory.yml new file mode 100644 index 00000000..8b48552f --- /dev/null +++ b/modules/ansible-roles/roles/lsf_template_config/tasks/lsf_inventory.yml @@ -0,0 +1,44 @@ +--- +# tasks file for lsf_templates + +- name: LSF | FP14 | Create inventory file from template + template: + src: fp14-inventory.j2 + dest: "{{ inventory_path }}/lsf-inventory" + delegate_to: localhost + + run_once: true + when: lsf_version == "fixpack_14" + +- name: LSF | FP14 | Create config file from template + template: + src: fp14-config.j2 + dest: "{{ inventory_path }}/lsf-config.yml" + delegate_to: localhost + run_once: true + when: lsf_version == "fixpack_14" + +- name: LSF | FP15 | Create inventory file from template + template: + src: fp15-inventory.j2 + dest: "{{ inventory_path }}/lsf-inventory" + delegate_to: localhost + run_once: true + when: lsf_version == "fixpack_15" + +- name: LSF | FP15 | Create config file from 
template + template: + src: fp15-config.j2 + dest: "{{ inventory_path }}/lsf-config.yml" + delegate_to: localhost + run_once: true + when: lsf_version == "fixpack_15" + +# Update deployer_hostname in group_vars/all for both FP14 and FP15 +- name: LSF | Update deployer_hostname in group_vars/all + ansible.builtin.lineinfile: + path: "{{ inventory_path }}/group_vars/all" + regexp: "^deployer_hostname: .*" + line: "deployer_hostname: {{ lsf_deployer_hostname }}" + delegate_to: localhost + run_once: true diff --git a/modules/ansible-roles/roles/lsf_template_config/tasks/lsf_prepare.yml b/modules/ansible-roles/roles/lsf_template_config/tasks/lsf_prepare.yml new file mode 100644 index 00000000..0dd65bf1 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_template_config/tasks/lsf_prepare.yml @@ -0,0 +1,29 @@ +--- +# Installing required prerequisite packages for the LSF cluster creation. +- name: LSF | Check OS version for package installation + ansible.builtin.shell: "grep -oE 'release [0-9]+' /etc/redhat-release | awk '{print $2}'" + register: rhel_version + changed_when: false + +- name: LSF | Get installed packages + package_facts: + manager: auto + +- name: LSF | Ensure lsof and ipmitool packages are installed if missing + dnf: + name: "{{ ['lsof', 'ipmitool'] | difference(ansible_facts.packages.keys() | list) }}" + state: present + when: + - rhel_version.stdout in ['8', '9'] + - (['lsof', 'ipmitool'] | difference(ansible_facts.packages.keys() | list)) | length > 0 + +# Create the shared LSF directory +- name: LSF | Display the last directory + debug: + msg: "The last directory is: {{ lsf_dir }}" + +- name: LSF | Create base directories for LSF configuration + ansible.builtin.file: + path: "/mnt/lsf/{{ lsf_dir }}" + state: directory + mode: "0777" diff --git a/modules/ansible-roles/roles/lsf_template_config/tasks/main.yml b/modules/ansible-roles/roles/lsf_template_config/tasks/main.yml new file mode 100644 index 00000000..942cc3f4 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_template_config/tasks/main.yml @@ -0,0 +1,10 @@ +--- + +# install required prerequisite packages for the LSF cluster creation. 
+- import_tasks: lsf_prepare.yml + +# tasks file for template tasks +- import_tasks: lsf_inventory.yml + +# Install Python dependencies +- import_tasks: python_installation.yml diff --git a/modules/ansible-roles/roles/lsf_template_config/tasks/python_installation.yml b/modules/ansible-roles/roles/lsf_template_config/tasks/python_installation.yml new file mode 100644 index 00000000..6743d277 --- /dev/null +++ b/modules/ansible-roles/roles/lsf_template_config/tasks/python_installation.yml @@ -0,0 +1,66 @@ +--- + +# Install and Configure Python +# Check if Python 3.11 is installed +- name: Python Installation | Verify if Python 3.11 is installed + ansible.builtin.shell: "python3.11 --version" + register: python_check + ignore_errors: yes + changed_when: false + +# Install required packages if Python 3.11 is missing +- name: Python Installation | Install prerequisite packages + ansible.builtin.yum: + name: + - python3.11 + - ed + - libnsl + - python3.11-pip + state: present + when: python_check.rc != 0 + +# Remove old Python 3 and pip3 symbolic links if Python 3.11 is newly installed +- name: Python Installation | Remove existing symbolic links for Python 3 and pip3 + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - /usr/bin/python3 + - /bin/pip3 + - /etc/alternatives/python3 + when: python_check.rc != 0 + +# Create symbolic links for Python 3.11 and pip3.11 +- name: Python Installation | Create symbolic links for Python 3.11 and pip3.11 + ansible.builtin.file: + src: "{{ item.src }}" + dest: "{{ item.dest }}" + state: link + loop: + - { src: "/usr/bin/python3.11", dest: "/usr/bin/python3" } + - { src: "/usr/bin/pip3.11", dest: "/bin/pip3" } + - { src: "/usr/bin/python3.11", dest: "/etc/alternatives/python3" } + when: python_check.rc != 0 + +- name: Python Installation | Install IBM Cloud Networking Services, SDK Core, and selinux + ansible.builtin.shell: "python3.11 -m pip install ibm-cloud-networking-services ibm-cloud-sdk-core selinux" + args: + executable: /bin/bash + when: python_check.rc != 0 + +- name: Python Installation | Install requests, PyYAML, Ansible, Ansible Core, and jmespath + ansible.builtin.shell: "python3.11 -m pip install ibm-vpc==0.10.0 requests==2.27.1 pyyaml==6.0 ansible==5.9.0 ansible-core==2.12.6 jmespath==1.0.1" + args: + executable: /bin/bash + when: python_check.rc != 0 + +# Set proper permissions for Python 3.11 library directories +- name: Python Installation | Adjust permissions for Python 3.11 library directories + ansible.builtin.file: + path: "{{ item }}" + mode: '0755' + recurse: yes + loop: + - /usr/local/lib/python3.11 + - /usr/local/lib64/python3.11 + when: python_check.rc != 0 diff --git a/modules/ansible-roles/roles/lsf_template_config/templates/fp14-config.j2 b/modules/ansible-roles/roles/lsf_template_config/templates/fp14-config.j2 new file mode 100644 index 00000000..fdc6c7ff --- /dev/null +++ b/modules/ansible-roles/roles/lsf_template_config/templates/fp14-config.j2 @@ -0,0 +1,162 @@ +--- +#----------------------------------- +# Copyright IBM Corp. 1992, 2017. All rights reserved. +# US Government Users Restricted Rights - Use, duplication or disclosure +# restricted by GSA ADP Schedule Contract with IBM Corp. +#----------------------------------- + +LSF: + # Set my_cluster_name to the name of the cluster. + my_cluster_name: {{ prefix }} + + # Enable_Monitoring controls whether + # system monitoring functions are turned on. + # Set to False to disable monitoring or True + # to enable monitoring. 
+ Enable_Monitoring: {{ Enable_Monitoring }} + + # HA_shared_dir is the shared directory for High-Availability (HA). + # If set to a directory, the installation copies + # the configuration files and work directory + # contents to the specified directory, and updates the configuration + # to point to it. + # Set to "none" if an HA shared directory is not used, + # or if you use an NFS shared directory defined with + # NFS_install_dir below. + # HA_shared_dir: /shared/directory/path + HA_shared_dir: {{ HA_shared_dir }} + + # NFS_install_dir is a shared directory that contains + # the LSF master, server and client binary files, man pages, + # and configuration files. When defined, the directory + # is also used for HA, so HA_shared_dir above + # should be set to "none". + # NFS_install_dir: /nfs-or-gpfs-directory + # Optionally the LSF_Masters may be installed locally, \ + # to permit upgrading of LSF Masters independently \ + # of the rest of the cluster. This is controlled \ + # by the LSF_MASTERS_ON_LOCAL parameter in: \ + # /opt/ibm/lsf_installer/playbook/group_vars/all + NFS_install_dir: {{ NFS_install_dir }} + + # JDBC_string is the connection string for the + # optional external database. If a host has + # been given the role of "database" in the + # lsf-inventory file, the specified value is set + # automatically. If you do not specify a host in the DB_Host role + # in the lsf-inventory file, you must define a JDBC_string. + # The external database must be created + # by sourcing the database schema files: + # - Copy the /opt/ibm/lsf_installer/DBschema/MySQL/*.sql files on the deployer machine to the remote database host + # - Create a database that is named pac + # - Create a database user who is named pacuser + # - Grant this user all privileges on the pac database + # - Run the schema files for the database + # For more information, see + # "https://www.ibm.com/support/knowledgecenter/SSZRJV_10.2.0/install_guide/pac_install_config.html". + # Set the following environment variables on the deployer machine: + # - JDBC_USER + # - JDBC_PASSWORD + # + #JDBC_string: jdbc:mariadb://[:]/?useUnicode=true&characterEncoding=UTF-8&serverTimezone=GMT + # + JDBC_string: none + + # (Optional) Primary_LSF_ADMIN is a primary LSF administrator. + # + # The primary LSF administrator account must exist + # on all hosts in the cluster before you install LSF. + # + # Note that the primary LSF administrator is lsfadmin by default. + # if the lsfadmin user does not exist, the installation creates lsfadmin + # with UID 495. The primary administrator owns the LSF configuration files + # and log files for job events. + # + #Primary_LSF_ADMIN: user_name + + # (Optional) Secondary_LSF_ADMINS is a space delimited + # list of secondary LSF administrators. + # + # LSF administrators have permission to reconfigure LSF + # and to control batch jobs that are submitted by other users. + # Secondary administrators typically do not have permission to start LSF + # daemons. Usually, only root has permission to start LSF daemons. + # + # All secondary LSF administrator accounts must exist + # on all hosts in the cluster before you install LSF. + # + #Secondary_LSF_ADMINS: user_name1 user_name2 ... + + # (Optional) LSF_Ports is a space delimited + # list of LSF port numbers. + # + # LSF has the following default port number values listed below. + # All are used for TCP, except LSF_LIM_PORT which also uses UDP. 
+ #LSF_LIM_PORT: 7869 + #LSF_RES_PORT: 6878 + #LSB_SBD_PORT: 6882 + #LSB_MBD_PORT: 6881 + #LSB_QUERY_PORT: 6891 + #LSF_DATA_PORT: 9998 + #EGO_KD_PORT: 7870 + #EGO_PEM_PORT: 7871 + #ESC_PORT: 7872 + # + # Specify either nine individual port numbers or + # a range with the starting port number followed by '-'. + # A port number must be an integer in the range 1024 - 65535 + # except for any of the following: + # 1966, 1967, 1968, 5000, 6080, 8046, 8047, 8048, 8080, 8081, 8443, 8444. + # For example, + # LSF_Ports: 9991 9992 9993 9994 9995 9996 9997 9998 9999 + # LSF_Ports: 9991 - + # where '-' indicates eight consecutive numbers from the leading number + # + #LSF_Ports: port_number1 [ - | port_number2 ... port_number9 ] + + # (Optional) Private_IPv4_Range allows you to specify a range of private IPv4 + # addresses used by LSF hosts. + # This parameter can be used in scenarios where the LSF master host has both + # public and private IP addresses, but the compute nodes have only private IP + # addresses. + # Specify a range of IPv4 addresses in the form of a Classless Inter-Domain + # Routing (CIDR) block. + # For example, Private_IPv4_Range: 10.10.99.0/8 + #Private_IPv4_Range: none + + # The ES_SERVERS parameter sets the list of the Elasticsearch servers where + # Elasticsearch 7.2, or later, is running. + # + # It contains the URLs to which Elasticsearch is listening and enables + # LSF Explorer to use the Elasticsearch features. + # + # Enter a space-separated list of the Elasticserch server URLs in the following format: + # http:// or https:// IP_address or host_name : port number + # For example, + # ES_SERVERS: "http://hostA:9200 http://hostB:9200" + # + # If the first server does not respond, the next server in the list is contacted. + # + #ES_SERVERS: "http://ES_host_ip:9200" + + # ES_SSL_CERTIFICATE_PATH must be configured when + # the protocol Elasticsearch (scheme of the URL) is https. + # The file path must be accessible on the deployer machine. + # For example, + #ES_SSL_CERTIFICATE_PATH: /path/to/cert_file.crt + + # LOGSTASH_TOP is the top directory of the Logstash installation. + # Use this parameter to enable the energy accounting and the gpfsio-collector service. + # When LOGSTASH_TOP is defined, ES_SERVERS must also be defined in lsf-config.yml. + # + # The directory path must point to the top directory of Logstash on a host in the GUI_Hosts role. + # For example, LOGSTASH_TOP: /path/to/logstash/top/directory + # + # If the Logstash path.settings is not set to LOGSTASH_TOP/config, + # make a symbolic link for LOGSTASH_TOP/config to the Logstash path.settings directory. + # For example, + # ln -s /etc/logstash LOGSTASH_TOP/config + # + #LOGSTASH_TOP: none + +# END OF LSF-CONFIG.YML diff --git a/modules/ansible-roles/roles/lsf_template_config/templates/fp14-inventory.j2 b/modules/ansible-roles/roles/lsf_template_config/templates/fp14-inventory.j2 new file mode 100644 index 00000000..b15eabfb --- /dev/null +++ b/modules/ansible-roles/roles/lsf_template_config/templates/fp14-inventory.j2 @@ -0,0 +1,58 @@ +# LSF Suites Inventory file +[local] +localhost +# Use this file to define the machines there roles +# A machine can belong to more than one role, but +# should only belong to one LSF_xxxxx role. + +# LSF_Masters are the machines LSF will use to run the +# management processes. For HA there must be 2 or more, +# and the shared filesystem must be available. 
+[LSF_Masters] +{% for host in lsf_masters %} +{{ host }} +{% endfor %} + +# LSF_Servers are machines that LSF will use to run jobs. +# Expressions can be used to represent a number of +# machines e.g. +# host[1:100] == host1, host2, host3, ... host100 +# host[a:f] == hosta, hostb, hostc, ... hostf +[LSF_Servers] +{% for host in lsf_servers %} +{{ host }} +{% endfor %} + +# LSF_Clients are machines that cannot run work, but can +# submit jobs via the CLI, and query the cluster. +# These are optional. +[LSF_Clients] +{% for host in lsf_clients %} +{{ host }} +{% endfor %} + +# GUI_Hosts are machines that will run the GUI and +# other supporting services. A minimum of 1 machine +# needs to be a GUI host. If there is no HA_shared_dir +# then this must be set to the LSF_Masters host. +# Use public (external) host name if the machine has +# multiple NICs. Make sure the machine can be ping-able +# by using both its public IP address and the host +# name reported by the hostname command and vice versa. +[GUI_Hosts] +{% for host in gui_hosts %} +{{ host }} +{% endfor %} + +# DB_HOST is optional, and is the machine that hosts the database +# used by the Application Center component in LSF Suite. +# However, this database is not configured for High Availability (HA). +# To enable HA for this database, manually create the database using MariaDB +# and configure it to be HA-ready, then set the JDBC_string parameter +# in the /opt/ibm/lsf_installer/playbook/lsf-config.yml file to specify the database connection. +[DB_Host] +{% for host in db_hosts %} +{{ host }} +{% endfor %} + +[LSF_WebService] diff --git a/modules/ansible-roles/roles/lsf_template_config/templates/fp15-config.j2 b/modules/ansible-roles/roles/lsf_template_config/templates/fp15-config.j2 new file mode 100644 index 00000000..44cad94c --- /dev/null +++ b/modules/ansible-roles/roles/lsf_template_config/templates/fp15-config.j2 @@ -0,0 +1,219 @@ +--- +#----------------------------------- +# Copyright IBM Corp. 1992, 2017. All rights reserved. +# US Government Users Restricted Rights - Use, duplication or disclosure +# restricted by GSA ADP Schedule Contract with IBM Corp. +#----------------------------------- + +LSF: + # Set my_cluster_name to the name of the cluster. + my_cluster_name: {{ prefix }} + + # Enable_Monitoring controls whether + # system monitoring functions are turned on. + # Set to False to disable monitoring or True + # to enable monitoring. + Enable_Monitoring: {{ Enable_Monitoring }} + + # HA_shared_dir is the shared directory for High-Availability (HA). + # If set to a directory, the installation copies + # the configuration files and work directory + # contents to the specified directory, and updates the configuration + # to point to it. + # Set to "none" if an HA shared directory is not used, + # or if you use an NFS shared directory defined with + # NFS_install_dir below. + # HA_shared_dir: /shared/directory/path + HA_shared_dir: {{ HA_shared_dir }} + + # NFS_install_dir is a shared directory that contains + # the LSF master, server and client binary files, man pages, + # and configuration files. When defined, the directory + # is also used for HA, so HA_shared_dir above + # should be set to "none". + # NFS_install_dir: /nfs-or-gpfs-directory + # Optionally the LSF_Masters may be installed locally, \ + # to permit upgrading of LSF Masters independently \ + # of the rest of the cluster. 
This is controlled \ + # by the LSF_MASTERS_ON_LOCAL parameter in: \ + # /opt/ibm/lsf_installer/playbook/group_vars/all + NFS_install_dir: {{ NFS_install_dir }} + + # JDBC_string is the connection string for the + # optional external database. If a host has + # been given the role of "database" in the + # lsf-inventory file, the specified value is set + # automatically. If you do not specify a host in the DB_Host role + # in the lsf-inventory file, you must define a JDBC_string. + # The external database must be created + # by sourcing the database schema files: + # - Copy the /opt/ibm/lsf_installer/DBschema/MySQL/*.sql files on the deployer machine to the remote database host + # - Create a database that is named pac + # - Create a database user who is named pacuser + # - Grant this user all privileges on the pac database + # - Run the schema files for the database + # For more information, see + # "https://www.ibm.com/support/knowledgecenter/SSZRJV_10.2.0/install_guide/pac_install_config.html". + # Set the following environment variables on the deployer machine: + # - JDBC_USER + # - JDBC_PASSWORD + # + #JDBC_string: jdbc:mariadb://[:]/?useUnicode=true&characterEncoding=UTF-8&serverTimezone=GMT + # + JDBC_string: none + + # (Optional) Primary_LSF_ADMIN is a primary LSF administrator. + # + # The primary LSF administrator account must exist + # on all hosts in the cluster before you install LSF. + # + # Note that the primary LSF administrator is lsfadmin by default. + # if the lsfadmin user does not exist, the installation creates lsfadmin + # with UID 495. The primary administrator owns the LSF configuration files + # and log files for job events. + # + #Primary_LSF_ADMIN: user_name + + # (Optional) Secondary_LSF_ADMINS is a space delimited + # list of secondary LSF administrators. + # + # LSF administrators have permission to reconfigure LSF + # and to control batch jobs that are submitted by other users. + # Secondary administrators typically do not have permission to start LSF + # daemons. Usually, only root has permission to start LSF daemons. + # + # All secondary LSF administrator accounts must exist + # on all hosts in the cluster before you install LSF. + # + #Secondary_LSF_ADMINS: user_name1 user_name2 ... + + # (Optional) LSF_Ports is a space delimited + # list of LSF port numbers. + # + # LSF has the following default port number values listed below. + # All are used for TCP, except LSF_LIM_PORT which also uses UDP. + #LSF_LIM_PORT: 7869 + #LSF_RES_PORT: 6878 + #LSB_SBD_PORT: 6882 + #LSB_MBD_PORT: 6881 + #LSB_QUERY_PORT: 6891 + #LSF_DATA_PORT: 9998 + #EGO_KD_PORT: 7870 + #EGO_PEM_PORT: 7871 + #ESC_PORT: 7872 + # + # Specify either nine individual port numbers or + # a range with the starting port number followed by '-'. + # A port number must be an integer in the range 1024 - 65535 + # except for any of the following: + # 1966, 1967, 1968, 5000, 6080, 8046, 8047, 8048, 8080, 8081, 8443, 8444. + # For example, + # LSF_Ports: 9991 9992 9993 9994 9995 9996 9997 9998 9999 + # LSF_Ports: 9991 - + # where '-' indicates eight consecutive numbers from the leading number + # + #LSF_Ports: port_number1 [ - | port_number2 ... port_number9 ] + + # (Optional) Private_IPv4_Range allows you to specify a range of private IPv4 + # addresses used by LSF hosts. + # This parameter can be used in scenarios where the LSF master host has both + # public and private IP addresses, but the compute nodes have only private IP + # addresses. 
+ # Specify a range of IPv4 addresses in the form of a Classless Inter-Domain + # Routing (CIDR) block. + # For example, Private_IPv4_Range: 10.10.99.0/8 + #Private_IPv4_Range: none + + # The ES_SERVERS parameter sets the list of the Elasticsearch servers where + # Elasticsearch 7.2, or later, is running. + # + # It contains the URLs to which Elasticsearch is listening and enables + # LSF Explorer to use the Elasticsearch features. + # + # Enter a space-separated list of the Elasticserch server URLs in the following format: + # http:// or https:// IP_address or host_name : port number + # For example, + # ES_SERVERS: "http://hostA:9200 http://hostB:9200" + # + # If the first server does not respond, the next server in the list is contacted. + # + #ES_SERVERS: "http://ES_host_ip:9200" + + # ES_SSL_CERTIFICATE_PATH must be configured when + # the protocol Elasticsearch (scheme of the URL) is https. + # The file path must be accessible on the deployer machine. + # For example, + #ES_SSL_CERTIFICATE_PATH: /path/to/cert_file.crt + + # LOGSTASH_TOP is the top directory of the Logstash installation. + # Use this parameter to enable the energy accounting and the gpfsio-collector service. + # When LOGSTASH_TOP is defined, ES_SERVERS must also be defined in lsf-config.yml. + # + # The directory path must point to the top directory of Logstash on a host in the GUI_Hosts role. + # For example, LOGSTASH_TOP: /path/to/logstash/top/directory + # + # If the Logstash path.settings is not set to LOGSTASH_TOP/config, + # make a symbolic link for LOGSTASH_TOP/config to the Logstash path.settings directory. + # For example, + # ln -s /etc/logstash LOGSTASH_TOP/config + # + #LOGSTASH_TOP: none + +# options for the LSF Web Service +LWS: + # HTTP_MODE: + #-------------------------------- + # Has a value of either http or https. Default http. + + # HTTP_PORT: + #-------------------------------- + # This is the port that will be used if http mode is used. Default of 8088. + + # HTTPS_PORT: + #-------------------------------- + # This is the port that will be used if https mode is used. Default of 8448. + + # SSL_VALID_HOSTS: "host1 host2" + # --------------------- + # {REQUIRED IF YOU ENABLE HTTPS WITH HIGH AVAILABILITY DURING INSTALLATION} + # List of valid hosts that will be configured in the SSL Certificate. + # + # By default, IBM Spectrum LSF Web Service enables HTTPS. + # In non-silent installation, you can override the default behavior and choose to not enable HTTPS. + # Specify IBM Spectrum LSF Web Service servers that will be configured in the + # SSL security certificate used for IBM Spectrum LSF Web Service. Only hosts + # defined in the SSL Certificate will have successful SSL connections + # without any warnings. + # + # You can specify short host names, fully qualified host names and IP addresses. + # Separate multiple entries with a space. + # + # For High Availability, include the IBM HTTP Server and any Platform + # Web Service servers you want to be reachable. + + # LWS_CLUSTER_NAME: + # ----------------- + # Enables High Availability (HA) in IBM Spectrum LSF Web Service. + # + # Specify a name for the group of hosts that represents your IBM Spectrum LSF Web Service servers. + # Valid values are alphanumeric characters. + # + # The cluster name you specify here is used to create an IBM WebSphere cluster, and is written to + # the configuration file $LWS_CONFDIR/profile.lws. You can change the cluster name after installation if desired. 
+ # + # IMPORTANT: You cannot enable High Availability after installation without reinstalling LSF Web Service. + # + # Default: empty, High Availability is not enabled + + # SHARED_CONFIGURATION_DIR: "/scratch/product/share/configuration" + # ----------------- + # Shared location of the configuration and work directory, used for High Availability. + # Required for High Availability if the directory '/opt/ibm/lsfsuite' + # is a local directory. If the directory '/opt/ibm/lsfsuite' is on a shared file system, + # do not specify this directory as the configuration and work directories are already + # in a shared location. + # + # Default: undefined, the configuration and work directory are installed on the local + # host, within '/opt/ibm/lsfsuite'. + +# END OF LSF-CONFIG.YML diff --git a/modules/ansible-roles/roles/lsf_template_config/templates/fp15-inventory.j2 b/modules/ansible-roles/roles/lsf_template_config/templates/fp15-inventory.j2 new file mode 100644 index 00000000..ed5a37bc --- /dev/null +++ b/modules/ansible-roles/roles/lsf_template_config/templates/fp15-inventory.j2 @@ -0,0 +1,65 @@ +# LSF Suites Inventory file +[local] +localhost +# Use this file to define the machines there roles +# A machine can belong to more than one role, but +# should only belong to one LSF_xxxxx role. + +# LSF_Masters are the machines LSF will use to run the +# management processes. For HA there must be 2 or more, +# and the shared filesystem must be available. +[LSF_Masters] +{% for host in lsf_masters %} +{{ host }} +{% endfor %} + + +# LSF_Servers are machines that LSF will use to run jobs. +# Expressions can be used to represent a number of +# machines e.g. +# host[1:100] == host1, host2, host3, ... host100 +# host[a:f] == hosta, hostb, hostc, ... hostf +[LSF_Servers] +{% for host in lsf_servers %} +{{ host }} +{% endfor %} + +# LSF_Clients are machines that cannot run work, but can +# submit jobs via the CLI, and query the cluster. +# These are optional. +[LSF_Clients] +{% for host in lsf_clients %} +{{ host }} +{% endfor %} + +# GUI_Hosts are machines that will run the GUI and +# other supporting services. A minimum of 1 machine +# needs to be a GUI host. If there is no HA_shared_dir +# then this must be set to the LSF_Masters host. +# Use public (external) host name if the machine has +# multiple NICs. Make sure the machine can be ping-able +# by using both its public IP address and the host +# name reported by the hostname command and vice versa. +[GUI_Hosts] +{% for host in gui_hosts %} +{{ host }} +{% endfor %} + + +# DB_HOST is optional, and is the machine that hosts the database +# used by the Application Center component in LSF Suite. +# However, this database is not configured for High Availability (HA). +# To enable HA for this database, manually create the database using MariaDB +# and configure it to be HA-ready, then set the JDBC_string parameter +# in the /opt/ibm/lsf_installer/playbook/lsf-config.yml file to specify the database connection. +[DB_Host] +{% for host in db_hosts %} +{{ host }} +{% endfor %} + +# LSF_WebService is an optional role for hosts LSF will use to run the +# LSF Web Service. The default is for the primary host to be the LWS host. 
+[LSF_WebService] +{% for host in gui_hosts %} +{{ host }} +{% endfor %} diff --git a/modules/ansible-roles/roles/lsf_template_config/vars/main.yml b/modules/ansible-roles/roles/lsf_template_config/vars/main.yml new file mode 100644 index 00000000..360951cf --- /dev/null +++ b/modules/ansible-roles/roles/lsf_template_config/vars/main.yml @@ -0,0 +1,6 @@ +--- + +# Static Variables + +inventory_path: "/opt/ibm/lsf_installer/playbook/" +lsf_dir: "{{ HA_shared_dir | basename }}" diff --git a/modules/ansible-roles/roles/vpc_fileshare_config/handlers/main.yml b/modules/ansible-roles/roles/vpc_fileshare_config/handlers/main.yml new file mode 100644 index 00000000..85aa54c4 --- /dev/null +++ b/modules/ansible-roles/roles/vpc_fileshare_config/handlers/main.yml @@ -0,0 +1,3 @@ +--- +- name: Mount NFS + command: mount -a diff --git a/modules/ansible-roles/roles/vpc_fileshare_config/tasks/main.yml b/modules/ansible-roles/roles/vpc_fileshare_config/tasks/main.yml new file mode 100644 index 00000000..3d999267 --- /dev/null +++ b/modules/ansible-roles/roles/vpc_fileshare_config/tasks/main.yml @@ -0,0 +1,4 @@ +--- + +# tasks file for fileshare mount +- import_tasks: vpc_fileshare_configure.yml diff --git a/modules/ansible-roles/roles/vpc_fileshare_config/tasks/vpc_fileshare_configure.yml b/modules/ansible-roles/roles/vpc_fileshare_config/tasks/vpc_fileshare_configure.yml new file mode 100644 index 00000000..9e47e6e8 --- /dev/null +++ b/modules/ansible-roles/roles/vpc_fileshare_config/tasks/vpc_fileshare_configure.yml @@ -0,0 +1,72 @@ +--- +- name: Show VPC File Share mappings + ansible.builtin.debug: + msg: "{{ item.key }}: {{ item.value }}" + with_dict: "{{ name_mount_path_map }}" + +- name: Show NFS Share mappings + ansible.builtin.debug: + msg: "{{ item.key }}: {{ item.value }}" + with_dict: "{{ nfs_shares_map }}" + +- name: Create base directories for VPC File shares + file: + path: "{{ item.key }}" + state: directory + mode: '0777' + with_dict: "{{ name_mount_path_map }}" + +- name: Add entries to /etc/fstab for VPC File shares + lineinfile: + path: /etc/fstab + line: "{{ item.value }} {{ item.key }} nfs rw,sec=sys,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,_netdev 0 0" + state: present + create: yes + with_dict: "{{ name_mount_path_map }}" + notify: Mount NFS + +- name: Create base directories for all NFS mount points + file: + path: "{{ item.key }}" + state: directory + mode: '0777' + with_dict: "{{ nfs_shares_map }}" + +- name: Mount NFS shares + ansible.builtin.mount: + path: "{{ item.key }}" + src: "{{ item.value }}" + fstype: nfs + opts: rw,sec=sys,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,_netdev + state: mounted + with_dict: "{{ nfs_shares_map }}" + +- name: Validate that NFS shares were mounted successfully + shell: mount | grep "on {{ item.key }} type nfs" + register: mount_check_nfs + with_dict: "{{ nfs_shares_map }}" + changed_when: false + failed_when: mount_check_nfs is failed + +- name: Flush handlers immediately + meta: flush_handlers + +- name: Verify mounted filesystems + command: df -h + register: df_output + changed_when: false + failed_when: df_output.rc != 0 + +- name: Check directory listing + command: ls -ltr /mnt/lsf + register: ls_output + changed_when: false + failed_when: ls_output.rc != 0 + +- name: Set appropriate permissions on base dirs + ansible.builtin.shell: | + chmod -R {{ (item.key == '/mnt/lsf') | ternary('0755', '0777') }} "{{ item.key }}" + args: + warn: false + with_dict: "{{ name_mount_path_map }}" + run_once: true diff --git 
a/modules/baremetal/datasource.tf b/modules/baremetal/datasource.tf new file mode 100644 index 00000000..71e65999 --- /dev/null +++ b/modules/baremetal/datasource.tf @@ -0,0 +1,8 @@ +data "ibm_resource_group" "existing_resource_group" { + name = var.existing_resource_group +} + +data "ibm_is_image" "storage" { + count = length(var.storage_servers) + name = var.storage_servers[count.index]["image"] +} diff --git a/modules/baremetal/locals.tf b/modules/baremetal/locals.tf new file mode 100644 index 00000000..6fba7bc8 --- /dev/null +++ b/modules/baremetal/locals.tf @@ -0,0 +1,14 @@ +# define variables +locals { + prefix = var.prefix + storage_image_id = data.ibm_is_image.storage[*].id + storage_node_name = format("%s-%s", local.prefix, "strg") + resource_group_id = data.ibm_resource_group.existing_resource_group.id + bms_interfaces = ["ens1", "ens2"] + #storage_ssh_keys = [for name in var.storage_ssh_keys : data.ibm_is_ssh_key.storage[name].id] + + # TODO: explore (DA always keep it true) + #skip_iam_authorization_policy = true + storage_server_count = sum(var.storage_servers[*]["count"]) + enable_storage = local.storage_server_count > 0 +} diff --git a/modules/baremetal/main.tf b/modules/baremetal/main.tf new file mode 100644 index 00000000..9f77bbb3 --- /dev/null +++ b/modules/baremetal/main.tf @@ -0,0 +1,23 @@ +module "storage_key" { + count = local.enable_storage ? 1 : 0 + source = "./../key" +} + +module "storage_baremetal" { + source = "terraform-ibm-modules/bare-metal-vpc/ibm" + version = "1.1.0" + count = length(var.storage_servers) + server_count = var.storage_servers[count.index]["count"] + prefix = count.index == 0 ? local.storage_node_name : format("%s-%s", local.storage_node_name, count.index) + profile = var.storage_servers[count.index]["profile"] + image_id = local.storage_image_id[count.index] + create_security_group = false + subnet_ids = var.storage_subnets + ssh_key_ids = var.storage_ssh_keys + bandwidth = var.bandwidth + allowed_vlan_ids = var.allowed_vlan_ids + access_tags = null + resource_group_id = local.resource_group_id + security_group_ids = var.security_group_ids + user_data = data.template_file.storage_user_data.rendered +} diff --git a/modules/baremetal/outputs.tf b/modules/baremetal/outputs.tf new file mode 100644 index 00000000..1f429c38 --- /dev/null +++ b/modules/baremetal/outputs.tf @@ -0,0 +1,14 @@ +output "list" { + description = "A list of VSI with name, id, zone, and primary ipv4 address" + value = flatten([ + for module_instance in module.storage_baremetal : [ + for server_key, server_details in module_instance.baremetal_servers : + { + id = server_details.bms_server_id + name = server_details.bms_server_name + ipv4_address = try(server_details.bms_server_ip, "") + vni_id = server_details.bms_vni_id + } + ] + ]) +} diff --git a/modules/baremetal/template_files.tf b/modules/baremetal/template_files.tf new file mode 100644 index 00000000..a7117b26 --- /dev/null +++ b/modules/baremetal/template_files.tf @@ -0,0 +1,10 @@ +data "template_file" "storage_user_data" { + template = file("${path.module}/templates/storage_user_data.tpl") + vars = { + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + storage_public_key_content = local.enable_storage ? module.storage_key[0].public_key_content : "" + storage_private_key_content = local.enable_storage ? 
module.storage_key[0].private_key_content : "" + storage_interfaces = local.bms_interfaces[0] + storage_dns_domain = var.dns_domain_names["storage"] + } +} diff --git a/modules/baremetal/templates/storage_user_data.tpl b/modules/baremetal/templates/storage_user_data.tpl new file mode 100644 index 00000000..31f15e6b --- /dev/null +++ b/modules/baremetal/templates/storage_user_data.tpl @@ -0,0 +1,120 @@ +#!/usr/bin/bash + +################################################### +# Copyright (C) IBM Corp. 2023 All Rights Reserved. +# Licensed under the Apache License v2.0 +################################################### + +#!/usr/bin/env bash +exec > >(tee /var/log/ibm_spectrumscale_user-data.log) + +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please client as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys + +# input parameters +echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys +echo "${storage_public_key_content}" >> ~/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> ~/.ssh/config +echo "${storage_private_key_content}" > ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa + +# if grep -q "Red Hat" /etc/os-release +if grep -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser + REQ_PKG_INSTALLED=0 + if grep -q "platform:el9" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables-nft nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + elif grep -q "platform:el8" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python38 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl jq make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + else + PACKAGE_MGR=yum + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel yum-plugin-versionlock" + fi + + RETRY_LIMIT=5 + retry_count=0 + all_pkg_installed=1 + + while [[ $all_pkg_installed -ne 0 && $retry_count -lt $RETRY_LIMIT ]] + do + # Install all required packages + echo "INFO: Attempting to install packages" + $PACKAGE_MGR install -y $package_list + + # Check to ensure packages are installed + pkg_installed=0 + for pkg in $package_list + do + pkg_query=$($PACKAGE_MGR list installed $pkg) + pkg_installed=$(($? + $pkg_installed)) + done + if [[ $pkg_installed -ne 0 ]] + then + # The minimum required packages have not been installed. + echo "WARN: Required packages not installed. Sleeping for 60 seconds and retrying..." + touch /var/log/scale-rerun-package-install + echo "INFO: Cleaning and repopulating repository data" + $PACKAGE_MGR clean all + $PACKAGE_MGR makecache + sleep 60 + else + all_pkg_installed=0 + fi + retry_count=$(( $retry_count+1 )) + done + +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi + +yum update --security -y +yum versionlock $package_list +yum versionlock list +echo 'export PATH=$PATH:/usr/lpp/mmfs/bin' >> /root/.bashrc + +echo "###########################################################################################" >> /etc/motd +echo "# You have logged in to Storage BareMetal Server. 
#" >> /etc/motd +echo "# - Server storage is temporary storage that's available only while your Baremetal #" >> /etc/motd +echo "# server is running. #" >> /etc/motd +echo "# - Data on the drive is unrecoverable after server shutdown, disruptive maintenance, #" >> /etc/motd +echo "# or hardware failure. #" >> /etc/motd +echo "# #" >> /etc/motd +echo "# Refer: https://cloud.ibm.com/docs/vpc?topic=vpc-bare-metal-servers-storage #" >> /etc/motd +echo "###########################################################################################" >> /etc/motd + +echo "DOMAIN=${storage_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${storage_interfaces}" +echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${storage_interfaces}" +chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +sleep 120 +systemctl restart NetworkManager + +systemctl stop firewalld +firewall-offline-cmd --zone=public --add-port=1191/tcp +firewall-offline-cmd --zone=public --add-port=4444/tcp +firewall-offline-cmd --zone=public --add-port=4444/udp +firewall-offline-cmd --zone=public --add-port=4739/udp +firewall-offline-cmd --zone=public --add-port=4739/tcp +firewall-offline-cmd --zone=public --add-port=9084/tcp +firewall-offline-cmd --zone=public --add-port=9085/tcp +firewall-offline-cmd --zone=public --add-service=http +firewall-offline-cmd --zone=public --add-service=https +firewall-offline-cmd --zone=public --add-port=2049/tcp +firewall-offline-cmd --zone=public --add-port=2049/udp +firewall-offline-cmd --zone=public --add-port=111/tcp +firewall-offline-cmd --zone=public --add-port=111/udp +firewall-offline-cmd --zone=public --add-port=30000-61000/tcp +firewall-offline-cmd --zone=public --add-port=30000-61000/udp +systemctl start firewalld +systemctl enable firewalld diff --git a/modules/baremetal/variables.tf b/modules/baremetal/variables.tf new file mode 100644 index 00000000..f24d57c6 --- /dev/null +++ b/modules/baremetal/variables.tf @@ -0,0 +1,109 @@ +############################################################################## +# Resource Groups Variables +############################################################################## + +variable "existing_resource_group" { + description = "String describing resource groups to create or reference" + type = string + default = null +} + +variable "prefix" { + description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." + type = string + + validation { + error_message = "Prefix must begin and end with a letter and contain only letters, numbers, and - characters." + condition = can(regex("^([A-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.prefix)) + } +} + +############################################################################## +# Scale Storage Variables +############################################################################## + +/*variable "storage_subnets" { + type = list(object({ + name = string + id = string + zone = string + cidr = string + })) + default = [] + description = "Subnets to launch the storage host." +}*/ + +variable "storage_subnets" { + type = list(string) + description = "Subnets to launch the storage host." +} + +variable "storage_ssh_keys" { + type = list(string) + description = "The key pair to use to launch the storage cluster host." 
+} + +variable "storage_servers" { + type = list( + object({ + profile = string + count = number + image = string + filesystem = string + }) + ) + default = [{ + profile = "cx2d-metal-96x192" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem = "/gpfs/fs1" + }] + description = "Number of BareMetal Servers to be launched for storage cluster." +} + +variable "bandwidth" { + description = "The allocated bandwidth (in Mbps) for the bare metal server to manage network traffic. If unset, default values apply." + type = number + default = 100000 +} + +variable "allowed_vlan_ids" { + description = "A list of VLAN IDs that are permitted for the bare metal server, ensuring network isolation and control. Example: [100, 102]" + type = list(number) + default = ["100", "102"] +} + +variable "security_group_ids" { + description = "A list of security group ID's" + type = list(string) + default = [] +} + +############################################################################## +# Access Variables +############################################################################## + +variable "bastion_public_key_content" { + type = string + sensitive = true + default = null + description = "Bastion security group id." +} + +############################################################################## +# DNS Template Variables +############################################################################## + +variable "dns_domain_names" { + type = object({ + compute = string + storage = string + protocol = string + }) + default = { + compute = "comp.com" + storage = "strg.com" + protocol = "ces.com" + } + description = "IBM Cloud HPC DNS domain names." +} diff --git a/modules/baremetal/version.tf b/modules/baremetal/version.tf new file mode 100644 index 00000000..b87bee94 --- /dev/null +++ b/modules/baremetal/version.tf @@ -0,0 +1,18 @@ +############################################################################## +# Terraform Providers +############################################################################## + +terraform { + required_version = ">= 1.9.0" + # Use "greater than or equal to" range for root level modules + required_providers { + ibm = { + source = "IBM-Cloud/ibm" + version = ">= 1.68.1, < 2.0.0" + } + template = { + source = "hashicorp/template" + version = "~> 2" + } + } +} diff --git a/modules/bootstrap/datasource.tf b/modules/bootstrap/datasource.tf deleted file mode 100644 index 3c983a58..00000000 --- a/modules/bootstrap/datasource.tf +++ /dev/null @@ -1,9 +0,0 @@ -data "ibm_is_image" "bastion" { - name = local.bastion_image_name -} - -# Existing Bastion details -data "ibm_is_instance" "bastion_instance_name" { - count = var.bastion_instance_name != null ? 
1 : 0 - name = var.bastion_instance_name -} diff --git a/modules/bootstrap/locals.tf b/modules/bootstrap/locals.tf deleted file mode 100644 index 9de0ff38..00000000 --- a/modules/bootstrap/locals.tf +++ /dev/null @@ -1,95 +0,0 @@ -# define variables -locals { - name = "lsf" - prefix = var.prefix - tags = [local.prefix, local.name] - - schematics_reserved_cidrs = [ - "169.44.0.0/14", - "169.60.0.0/14", - "158.175.0.0/16", - "158.176.0.0/15", - "141.125.0.0/16", - "161.156.0.0/16", - "149.81.0.0/16", - "159.122.111.224/27", - "150.238.230.128/27", - "169.55.82.128/27" - ] - - bastion_sg_variable_cidr = flatten([ - local.schematics_reserved_cidrs, - var.allowed_cidr - # var.network_cidr - ]) - - bastion_node_name = format("%s-%s", local.prefix, "bastion") - - bastion_machine_type = "cx2-4x8" - bastion_image_name = "ibm-ubuntu-22-04-4-minimal-amd64-3" - - bastion_image_id = data.ibm_is_image.bastion.id - - bastion_sg_variable_cidr_list = var.network_cidr - # Security group rules - # TODO: Fix SG rules - bastion_security_group_rules = flatten([ - [for cidr in local.bastion_sg_variable_cidr : { - name = format("allow-variable-inbound-%s", index(local.bastion_sg_variable_cidr, cidr) + 1) - direction = "inbound" - remote = cidr - # ssh port - tcp = { - port_min = 22 - port_max = 22 - } - }], - [for cidr in local.bastion_sg_variable_cidr : { - name = format("allow-variable-outbound-%s", index(local.bastion_sg_variable_cidr, cidr) + 1) - direction = "outbound" - remote = cidr - }], - [for cidr in local.bastion_sg_variable_cidr_list : { - name = format("allow-variable-inbound-cidr-%s", index(local.bastion_sg_variable_cidr_list, cidr) + 1) - direction = "inbound" - remote = cidr - tcp = { - port_min = 22 - port_max = 22 - } - }], - [for cidr in local.bastion_sg_variable_cidr_list : { - name = format("allow-variable-outbound-cidr-%s", index(local.bastion_sg_variable_cidr_list, cidr) + 1) - direction = "outbound" - remote = cidr - }] - ]) - - # Derived configs - # VPC - - # Subnets - bastion_subnets = var.bastion_subnets - - # Bastion Security group rule update to connect with login node - bastion_security_group_rule_update = [ - { - name = "inbound-rule-for-login-node-connection" - direction = "inbound" - remote = var.bastion_security_group_id - } - ] - - # Bastion Security Group rule update with LDAP server - ldap_security_group_rule = [ - { - name = "inbound-rule-for-ldap-node-connection" - direction = "inbound" - remote = var.ldap_server - tcp = { - port_min = 389 - port_max = 389 - } - } - ] -} diff --git a/modules/bootstrap/main.tf b/modules/bootstrap/main.tf deleted file mode 100644 index 088e44cf..00000000 --- a/modules/bootstrap/main.tf +++ /dev/null @@ -1,67 +0,0 @@ -module "ssh_key" { - count = 1 - source = "./../key" - # private_key_path = "bastion_id_rsa" #checkov:skip=CKV_SECRET_6 -} - -module "bastion_sg" { - count = 1 - source = "terraform-ibm-modules/security-group/ibm" - version = "2.6.2" - add_ibm_cloud_internal_rules = true - resource_group = var.resource_group - security_group_name = format("%s-bastion-sg", local.prefix) - security_group_rules = local.bastion_security_group_rules - vpc_id = var.vpc_id - tags = local.tags -} - -module "bastion_sg_with_ldap_update" { - count = var.ldap_server == "null" ? 
0 : 1 - source = "terraform-ibm-modules/security-group/ibm" - version = "2.6.2" - resource_group = var.resource_group - add_ibm_cloud_internal_rules = true - use_existing_security_group_id = true - existing_security_group_id = module.bastion_sg[0].security_group_id - security_group_rules = local.ldap_security_group_rule - vpc_id = var.vpc_id - depends_on = [module.bastion_sg] -} - -module "existing_bastion_sg_update" { - count = var.bastion_security_group_id != null ? 1 : 0 - source = "terraform-ibm-modules/security-group/ibm" - version = "2.6.2" - resource_group = var.resource_group - add_ibm_cloud_internal_rules = true - use_existing_security_group_id = true - existing_security_group_id = var.bastion_security_group_id - security_group_rules = local.bastion_security_group_rule_update - vpc_id = var.vpc_id - depends_on = [module.bastion_sg] -} - -module "bastion_vsi" { - count = var.bastion_instance_name != null ? 0 : 1 - source = "terraform-ibm-modules/landing-zone-vsi/ibm" - version = "5.0.0" - vsi_per_subnet = 1 - create_security_group = false - security_group = null - image_id = local.bastion_image_id - machine_type = local.bastion_machine_type - prefix = local.bastion_node_name - resource_group_id = var.resource_group - enable_floating_ip = true - security_group_ids = module.bastion_sg[*].security_group_id - ssh_key_ids = var.ssh_keys - subnets = length(var.bastion_subnets) == 2 ? [local.bastion_subnets[1]] : [local.bastion_subnets[0]] - tags = local.tags - user_data = data.template_file.bastion_user_data.rendered - vpc_id = var.vpc_id - kms_encryption_enabled = var.kms_encryption_enabled - skip_iam_authorization_policy = var.skip_iam_authorization_policy - boot_volume_encryption_key = var.boot_volume_encryption_key - existing_kms_instance_guid = var.existing_kms_instance_guid -} diff --git a/modules/bootstrap/template_files.tf b/modules/bootstrap/template_files.tf deleted file mode 100644 index 9cb9ce11..00000000 --- a/modules/bootstrap/template_files.tf +++ /dev/null @@ -1,6 +0,0 @@ -data "template_file" "bastion_user_data" { - template = file("${path.module}/templates/bastion_user_data.tpl") - vars = { - ssh_public_key_content = module.ssh_key[0].public_key_content - } -} diff --git a/modules/ce_project/main.tf b/modules/ce_project/main.tf deleted file mode 100644 index 7a1ab98d..00000000 --- a/modules/ce_project/main.tf +++ /dev/null @@ -1,40 +0,0 @@ -provider "shell" { - sensitive_environment = { - IBM_CLOUD_API_KEY = var.ibmcloud_api_key - } - interpreter = ["/bin/bash", "-c"] - enable_parallelism = false -} - -resource "shell_script" "ce_project" { - count = var.solution == "hpc" ? 1 : 0 - lifecycle_commands { - create = "scripts/create-update-ce-project.sh" - update = "scripts/create-update-ce-project.sh" - delete = "scripts/delete-ce-project.sh" - } - working_directory = path.module - sensitive_environment = { - RESERVATION_ID = var.reservation_id - } - environment = { - REGION = var.region - RESOURCE_GROUP_ID = var.resource_group_id - } - triggers = { - # We actually always do delete/create, since "update" is not implemented. - # when_value_changed = var.region - # ... - } -} - -resource "null_resource" "print_ce_project_logs" { - count = var.solution == "hpc" ? 
1 : 0 - provisioner "local-exec" { - command = "echo \"$LOG_OUTPUT\" | sed 's/\\(\\[[0-9]\\{8\\} [0-9]\\{2\\}:[0-9]\\{2\\}:[0-9]\\{2\\}\\]\\)/\\n\\1/g'" - working_dir = path.module - environment = { - LOG_OUTPUT = shell_script.ce_project[0].output["logs"] - } - } -} diff --git a/modules/ce_project/outputs.tf b/modules/ce_project/outputs.tf deleted file mode 100644 index 5b2c88ce..00000000 --- a/modules/ce_project/outputs.tf +++ /dev/null @@ -1,4 +0,0 @@ -output "guid" { - description = "Code Engine Project GUID" - value = var.solution == "hpc" ? shell_script.ce_project[0].output["guid"] : "" -} diff --git a/modules/ce_project/scripts/check_reservation.sh b/modules/ce_project/scripts/check_reservation.sh deleted file mode 100755 index 86be8b90..00000000 --- a/modules/ce_project/scripts/check_reservation.sh +++ /dev/null @@ -1,540 +0,0 @@ -#!/bin/bash -# -# Licensed Materials - Property of IBM -# 5725-S00 (C) Copyright IBM Corp. 2024. All Rights Reserved. -# US Government Users Restricted Rights - Use, duplication or -# disclosure restricted by GSA ADP Schedule Contract with IBM Corp. -IAM_ENDPOINT_URL="https://iam.cloud.ibm.com/identity/token" -RESOURCE_CONTROLLER_ENDPOINT_URL="https://resource-controller.cloud.ibm.com" -CODE_ENGINE_API_ENDPOINT_URL="https://api.REGION.codeengine.cloud.ibm.com" -V2_CONTEXT_ROOT="v2" -V2BETA_CONTEXT_ROOT="v2beta" -RESOURCE_PLAN_GUID="2e390ff1-fe87-458f-9a23-dfb6719509e1" - -TMP_DIR="/tmp" -HTTP_OUTPUT_FILE="${TMP_DIR}/hpcaas_http_output.log" - -REGION="" -RESOURCE_GROUP_ID="" - -LOG_FILE="/tmp/hpcaas-check-reservation.log" - -# Script return code: -# 0 - Success, a Reservation for the input RESERVATION_ID exists and a Code Engine Project exists for it. -# 1 - IBM_CLOUD_API_KEY and/or RESERVATION_ID environment variables are not provided. -# 2 - Parsing error, the script was not invoked correctly. -# 3 - Cannot retrieve JWT token, the script cannot exchange the IBM Cloud API key with a JWT token. -# 4 - Cannot retrieve a GUID for the input Reservation ID. -# 5 - Reservation doesn't exist, a Reservation for the input RESERVATION_ID doesn't. -# 6 - Cannot create the Code Engine project. -# 7 - Code Engine project creation timeout expired. -# 8 - Cannot associate the Code Engine project with guid GUID to the Reservation with id RESERVATION_ID. - -#################################################################################################### -# init_logging -# -# Description: -# this function initialize the hpcaas-check-reservation.log file -#################################################################################################### -init_logging() { - # Calculate the folder of the log file - LOG_FILE_FOLDER=$(dirname "${LOG_FILE}") - # Verify the folder exists, if not create it - if [ ! -d "${LOG_FILE_FOLDER}" ]; then - # Se non esiste, crea la directory - mkdir -p "${LOG_FILE_FOLDER}" - fi - # Remove everything from the log file - echo "" > "${LOG_FILE}" -} - -#################################################################################################### -# log -# -# Description: -# this function print the input message on a log file. -# Input: -# message, the message to print -# Output: -# message, the message with variable and timestamp rendered. 
-#################################################################################################### -log() { - local message=$1 - - # Create the timestamp to add in the log message - timestamp=$(date +'%Y%m%d %H:%M:%S') - # Print the message on the output and in the log file - echo "[$timestamp] ${message}" - echo "[$timestamp] ${message}" >> "${LOG_FILE}" -} - -#################################################################################################### -# usage -# -# Description: -# this function prints the usage and exit with an error -#################################################################################################### -usage() { - log "Usage: $0 [options]" - log "Options:" - log " --region id | -e id : Specify the Region" - log " --resource-group-id id | -e id : Specify the Resource Group ID" - log " [--output ] | -o [--output ] : Specify the log file. Default is stdout." - exit 2 -} - -#################################################################################################### -# parse_args -# -# Description: -# this function parse the input parameters. The following parameters are supported: -# --region id | -e id -# --resource-group-id id | -e id : Specify the Resource Group ID -# [--output ] | -o [--output ] : Specify the log file. Default is stdout -# Input: -# input parameters, the input parameters to parse -# Output: -# usage, the usage is printed if an error occured -#################################################################################################### -parse_args() { - while [[ $# -gt 0 ]]; do - case "$1" in - --region|-e) - shift - REGION="$1";; - --resource-group-id|-s) - shift - RESOURCE_GROUP_ID="$1";; - --output|-o) - shift - LOG_FILE="$1";; - *) - log "ERROR: parsing of the input arguments failed." - log "ERROR Details: invalid option $1." - usage;; - esac - shift - done - # Verify if the required options have been provided - if [[ -z "${REGION}" || -z "${RESOURCE_GROUP_ID}" ]]; then - log "ERROR: parsing of the input arguments failed." - log "ERROR Details: the options --region, and --resource-group-id are required." - usage - fi - - # Array contenente i valori consentiti per la regione - local allowed_regions=("us-east" "us-south" "eu-de") - - # Verifica se la regione specificata è tra i valori consentiti - # shellcheck disable=SC2199,SC2076 - if [[ ! " ${allowed_regions[@]} " =~ " ${REGION} " ]]; then - log "ERROR: parsing of the input arguments failed." - log "ERROR Details: Invalid region specified. Region must be one of: ${allowed_regions[*]}." - usage - fi -} - -#################################################################################################### -# get_token -# -# Description: -# this function validates the IBM Cloud API Key and return a JWT Baerer token to use for authentication -# when Code Engine API are invoked. This function takes in input the IBM Cloud API Key and return the -# JWT token. -# Input: -# api_key, the IBM Cloud API key that identify the IBM Cloud User account. 
-# Output: -# token, the JWT token if the function is successful -# http status, in case of failure the HTTP status is printed -# error message, in case of failure the error message is printed -# Return: -# 0, success -# 1, failure -#################################################################################################### -get_token() { - local api_key="$1" - local response - local http_status - local json_response - local token - local error_message - - # The IBM tool https://github.com/ibm/detect-secrets detected the secret we passed to the API. - # However, this is aa public secret so no real exposure exists. - - # This is the curl used to retrieve the JWT token given the IBM Cloud API Key in input - response=$(curl -s -w "%{http_code}" --request POST --url ${IAM_ENDPOINT_URL} --header 'Authorization: Basic Yng6Yng=' --header 'Content-Type: application/x-www-form-urlencoded' --data grant_type=urn:ibm:params:oauth:grant-type:apikey --data apikey="${api_key}") # pragma: allowlist secret - - # The curl return a reply with the following format { ... JSON ... }HTTPSTATUS. - # These two lines separate the HTTP STATUS from the JSON reply. - http_status="${response: -3}" - json_response=${response%???} - - # If HTTP Status = 200 the JWT token is printed and 0 is returned, otherwise - # 1 is printed (meaning error) and HTTP STATUS and error messages are printed. - # The reason for this is that if something goes wrong, the caller can print the HTTP STATUS - # code and the error messages so that the customer can understand the problem. - if [ "$http_status" -eq 200 ]; then - token=$(echo "$json_response" | jq -r '.access_token') - echo "$token" - return 0 - else - error_message=$(echo "$json_response" | jq -r '.errorMessage') - echo "$http_status" - echo "$error_message" - return 1 - fi -} - -#################################################################################################### -# get_guid_from_reservation_id -# -# Description: -# this function check if a Code Engine Project exists for the input reservation_id. If so, -# the function return with success, otherwise an error is returned. -# Input: -# jwt_token, the jwt token -# reservation_id, the reservation id to check -# Output: -# http_code, the HTTP code returned by Code Engine -# message, the HTTP message returned by Code Engine -# -# Return: -# 200 if everything is OK, otherwise an error code with relative message -#################################################################################################### -get_guid_from_reservation_id() { - local jwt_token="$1" - local result - local http_status - local response_message - - # This curl check if the input reservation id exists - result=$(curl -s -w "%{http_code}" -o ${HTTP_OUTPUT_FILE} \ - -H "Authorization: Bearer ${jwt_token}" \ - "${CODE_ENGINE_API_ENDPOINT_URL}/${V2BETA_CONTEXT_ROOT}/capacity_reservations") - - # The curl return a reply with the following format { ... JSON ... }HTTPSTATUS. - # These two lines separate the HTTP STATUS from the JSON reply. - http_status="${result: -3}" - response_message=$(cat "${HTTP_OUTPUT_FILE}") - - # Show both the HTTP code and the response message - echo "${http_status}" - echo "${response_message}" -} - -#################################################################################################### -# check_reservation -# -# Description: -# this function check if a Code Engine Project exists for the input reservation_id. If so, -# the function return with success, otherwise an error is returned. 
-# Input: -# jwt_token, the jwt token -# reservation_guid, the reservation guid to check -# Output: -# http_code, the HTTP code returned by Code Engine -# message, the HTTP message returned by Code Engine -# -# Return: -# 200 if everything is OK, otherwise an error code with relative message -#################################################################################################### -check_reservation() { - local jwt_token="$1" - local reservation_guid="$2" - local result - local http_status - local response_message - - # This curl check if the input reservation id exists - result=$(curl -s -w "%{http_code}" -o ${HTTP_OUTPUT_FILE} \ - -H "Authorization: Bearer ${jwt_token}" \ - "${CODE_ENGINE_API_ENDPOINT_URL}/${V2BETA_CONTEXT_ROOT}/capacity_reservations/${reservation_guid}") - - # The curl return a reply with the following format { ... JSON ... }HTTPSTATUS. - # These two lines separate the HTTP STATUS from the JSON reply. - http_status="${result: -3}" - response_message=$(cat "${HTTP_OUTPUT_FILE}") - - # Show both the HTTP code and the response message - echo "${http_status}" - echo "${response_message}" -} - -#################################################################################################### -# create_ce_project -# -# Description: -# this function creates a Code Engine Project. -# Input: -# jwt_token, the jwt token -# region, the region -# resource_group_id, the resource group id -# Output: -# http_code, the HTTP code returned by Code Engine -# message, the HTTP message returned by Code Engine -# Return: -# 201 or 202 if everything is OK, otherwise an error code with relative message -#################################################################################################### -create_ce_project() { - local jwt_token="$1" - local region="$2" - local resource_group_id="$3" - local timestamp - local project_name - local parameters - local allow_cleanup - local result - local http_code - local response_message - - timestamp=$(date "+%Y%m%d%H%M%S") - project_name="HPC-Default-${timestamp}" - parameters='{"name":"'"${project_name}"'","profile":"hpc"}' - allow_cleanup=false - - # This curl create an empty Code Engine project via Resource Controller - result=$(curl -s -w "%{http_code}" -o ${HTTP_OUTPUT_FILE} \ - -X POST \ - -H "Authorization: Bearer ${jwt_token}" \ - -H "Content-Type: application/json" \ - -d "{\"name\":\"${project_name}\",\"resource_plan_id\":\"${RESOURCE_PLAN_GUID}\",\"resource_group\":\"${resource_group_id}\",\"parameters\":${parameters},\"target\":\"${region}\",\"allow_cleanup\":${allow_cleanup}}" \ - "${RESOURCE_CONTROLLER_ENDPOINT_URL}/${V2_CONTEXT_ROOT}/resource_instances") - - # The curl return a reply with the following format { ... JSON ... }HTTPSTATUS. - # These two lines separate the HTTP STATUS from the JSON reply. - http_code="${result: -3}" - response_message=$(cat "${HTTP_OUTPUT_FILE}") - - # Show both the HTTP code and the response message - echo "$http_code" - echo "$response_message" -} - -#################################################################################################### -# wait_ce_project_creation -# -# Description: -# this function waits the Code Engine Project was successfully created. 
-# Input: -# guid, the Code Engine project guid -# Return: -# 0, successful -# 1, timeout expired -#################################################################################################### -wait_ce_project_creation() { - local jwt_token="$1" - local region="$2" - local ce_project_guid="$3" - # 3 minutes and 20s timeout - local timeout=300 - local start_time - local http_code - local response_message - local status - local current_time - local elapsed_time - local result - - start_time=$(date +%s) - # Loop until the Code Engine project is ready or the timeout expired - while true; do - # Check if the Code Engine project is ready - result=$(curl -s -w "%{http_code}" -o ${HTTP_OUTPUT_FILE} \ - -H "Authorization: Bearer ${jwt_token}" \ - "${CODE_ENGINE_API_ENDPOINT_URL}/${V2_CONTEXT_ROOT}/projects/${ce_project_guid}") - - # The curl return a reply with the following format { ... JSON ... }HTTPSTATUS. - # These two lines separate the HTTP STATUS from the JSON reply. - http_code="${result: -3}" - response_message=$(cat "${HTTP_OUTPUT_FILE}") - - # If the Code Engine project is ready, return - if [ "$http_code" -eq 200 ]; then - status=$(jq -r '.status' "${HTTP_OUTPUT_FILE}") - - # If status is not active exit from this cycle, Code Engine API returns this status when the project is ready - if [ "$status" == "active" ]; then - return 0 - fi - fi - - # Check if the timeout expired - current_time=$(date +%s) - elapsed_time=$((current_time - start_time)) - if [ "$elapsed_time" -ge "$timeout" ]; then - break - fi - - # Wait 10 seconds before retry the check. - sleep 10 - done - - # The Code Engine project wasn't successfully created, the timeout expired, so return error - return 1 -} - -#################################################################################################### -# associate_ce_project_to_reservation -# -# Description: -# this function associates the Code Engine project to the HPC Reservation -# Input: -# guid, the Code Engine project guid -# Return: -# 200 if everything is OK, otherwise an error code with relative message -#################################################################################################### -associate_ce_project_to_reservation() { - local jwt_token="$1" - local region="$2" - local ce_project_guid="$3" - local reservation_guid="$4" - local http_code - local response_message - - # This Code Engine API associate the Reservation ID to the Code Engine project previously created - result=$(curl -s -w "%{http_code}" -o "${HTTP_OUTPUT_FILE}" \ - -X PATCH \ - -H "Authorization: Bearer ${jwt_token}" \ - -H "Content-Type: application/json" \ - -d "{\"project_id\":\"${ce_project_guid}\"}" \ - "${CODE_ENGINE_API_ENDPOINT_URL}/${V2BETA_CONTEXT_ROOT}/projects/${ce_project_guid}/capacity_reservations/${reservation_guid}") - - # The curl return a reply with the following format { ... JSON ... }HTTPSTATUS. - # These two lines separate the HTTP STATUS from the JSON reply. - http_code="${result: -3}" - response_message=$(cat "${HTTP_OUTPUT_FILE}") - - # Show both the HTTP code and the response message - echo "${http_code}" - echo "${response_message}" -} - -#################################################################################################### -# Main program -#################################################################################################### -# First of all, let's parse the input parameters so that we have the input variables to work on. 
-# This parsing will populate the globa variables: -# - RESERVATION_ID -# - REGION -# - RESOURCE_GROUP_ID -# - LOG_FILE -parse_args "$@" -# Initialize the logging file -init_logging - -# The IBM tool https://github.com/ibm/detect-secrets detected a secret keyword. -# However, it detected only the API keyword but no real secret expure exists here. -if [ -z "$IBM_CLOUD_API_KEY" ]; then # pragma: allowlist secret - log "ERROR: environment variable IBM_CLOUD_API_KEY not provided. Run the command:" - log " export IBM_CLOUD_API_KEY=\"\"" # pragma: allowlist secret - exit 1 -fi - -if [ -z "$RESERVATION_ID" ]; then - log "ERROR: environment variable RESERVATION_ID not provided. Run the command:" - log " export RESERVATION_ID=\"\"" - exit 1 -fi - -# Since I have now the value for the REGION variable I can set correctly the: -# - Code Engine API Endpoint URL correctly -# - HPC API Endpoint URL correctly -CODE_ENGINE_API_ENDPOINT_URL=${CODE_ENGINE_API_ENDPOINT_URL//REGION/${REGION}} - -# Try to exchange the IBM Cloud API key for a JWT token. -log "INFO: Retrieving the JWT Token for the IBM_CLOUD_API_KEY." -if ! JWT_TOKEN=$(get_token "${IBM_CLOUD_API_KEY}"); then - HTTP_STATUS=$(echo "${JWT_TOKEN}" | head -n 1) - ERROR_MESSAGE=$(echo "${JWT_TOKEN}" | tail -n 1) - log "ERROR: cannot retrieve JWT token. HTTP Status ${HTTP_STATUS}. ${ERROR_MESSAGE}" - exit 3 -fi - -# HPC Tile has the parameter RESERVATION_ID that is meaningful name like Contract-IBM-WDC-OB -# As first step, we need to get the RESERVATION_GUID starting from the RESERVATION_ID. To do that, -# we retrieve a JSOn file contaaining the list of all the reservations. -log "INFO: Getting the Reservation GUID starting from the ID ." -response=$(get_guid_from_reservation_id "${JWT_TOKEN}") -http_code=$(echo "${response}" | head -n 1) -response_message=$(echo "${response}" | tail -n +2) - -# Check if the RESERVATION_GUID is available -if [ "${http_code}" != "200" ]; then - log "ERROR: Reservation GUID for the ID , wasn't found." - log "ERROR Details: ${response_message}." - exit 4 -fi - -# Now we have to check if in the JSON list exists a reservation equal to RESERVATION_ID -RESERVATION_GUID=$(echo "${response_message}" | jq -r ".capacity_reservations[] | select (.name == \"${RESERVATION_ID}\") | .id") -if [ -z "$RESERVATION_GUID" ]; then - log "ERROR: Reservation GUID for the ID , wasn't found." - exit 4 -fi - -# We found the RESERVATION_GUID associated with the RESERVATION_ID -log "INFO: Reservation (ID: ) has the GUID: ${RESERVATION_GUID}." - -# Now we have to check if the reservation RESERVATION_ID has a Code Engine Project exists for it. -log "INFO: Verifying the existence of a Reservation (GUID: ${RESERVATION_GUID})." -response=$(check_reservation "${JWT_TOKEN}" "${RESERVATION_GUID}") -http_code=$(echo "${response}" | head -n 1) -response_message=$(echo "${response}" | tail -n +2) - -# Check if the a Code Engine Project relative to the RESERVATION ID was found in Code Engine -if [ "${http_code}" != "200" ]; then - log "ERROR: Reservation with GUID ${RESERVATION_GUID}, wasn't found." - log "ERROR Details: ${response_message}." - exit 5 -fi - -# A Reservation with id RESERVATION_ID exists. We need to verify that a Code Engine project exists. -log "INFO: Verifying if the Reservation (GUID: ${RESERVATION_GUID}) is associated with a Code Engine project." 
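For reference (check_reservation.sh is being removed in this change set), the lookup it performs reduces to three REST calls: exchange the IBM Cloud API key for a bearer token at the IAM endpoint, list the Code Engine capacity reservations to map the human-readable RESERVATION_ID to its GUID, and read the optional project_id field on that reservation. Below is a minimal Python sketch of that flow, assuming the third-party requests package is available; the endpoints and JSON field names mirror the constants and jq filters used in the script, and the region value is only an example.

import os
import requests

REGION = "us-east"  # example; parse_args() above restricts this to us-east, us-south, eu-de


def get_token(api_key: str) -> str:
    # Exchange the IBM Cloud API key for a JWT bearer token at the IAM endpoint.
    resp = requests.post(
        "https://iam.cloud.ibm.com/identity/token",
        headers={"Authorization": "Basic Yng6Yng=",
                 "Content-Type": "application/x-www-form-urlencoded"},
        data={"grant_type": "urn:ibm:params:oauth:grant-type:apikey",
              "apikey": api_key},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()["access_token"]


def find_reservation(token: str, reservation_id: str):
    # List capacity reservations, resolve the reservation name to its GUID,
    # and report whether a Code Engine project is already attached to it.
    url = f"https://api.{REGION}.codeengine.cloud.ibm.com/v2beta/capacity_reservations"
    resp = requests.get(url, headers={"Authorization": f"Bearer {token}"}, timeout=30)
    resp.raise_for_status()
    match = next((r for r in resp.json().get("capacity_reservations", [])
                  if r.get("name") == reservation_id), None)
    if match is None:
        return None, None
    return match["id"], match.get("project_id")  # project_id is absent until a project is associated


if __name__ == "__main__":
    jwt = get_token(os.environ["IBM_CLOUD_API_KEY"])
    guid, project = find_reservation(jwt, os.environ["RESERVATION_ID"])
    print("reservation guid:", guid, "associated project:", project)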
- -# Check if a project_id exists in the response_message -CODE_ENGINE_PROJECT_GUID=$(echo "${response_message}" | jq -e -r '.project_id // empty') -if [ -n "${CODE_ENGINE_PROJECT_GUID}" ]; then - log "INFO: Reservation (GUID: ${RESERVATION_GUID}) exists and is associated with the Code Engine project (ID: ${CODE_ENGINE_PROJECT_GUID})." - log "INFO: CODE_ENGINE_PROJECT_GUID=${CODE_ENGINE_PROJECT_GUID}" - log "INFO: ${0} successfully completed." - exit 0 -fi - -# A Reservation with id RESERVATION_ID exists but a Code Engine project doesn't, we need to create it. -log "INFO: No Code Engine project is associated with the Reservation (GUID: ${RESERVATION_GUID}). Initiating project creation." -response=$(create_ce_project "${JWT_TOKEN}" "${REGION}" "${RESOURCE_GROUP_ID}") -http_code=$(echo "${response}" | head -n 1) -response_message=$(echo "${response}" | tail -n +2) - -# Check if the a Code Engine Project has been created -if [ "${http_code}" != "201" ] && [ "${http_code}" != "202" ]; then - log "ERROR: Cannot create a Code Engine project." - log "ERROR Details: ${response_message}." - exit 6 -fi - -# If Code Engine project has been created, wait for its completion -CODE_ENGINE_PROJECT_GUID=$(echo "${response_message}" | jq -e -r '.guid') -log "INFO: Code Engine project (GUID: ${CODE_ENGINE_PROJECT_GUID}) for the Reservation (GUID: ${RESERVATION_GUID}) has been created. Waiting for its completion." -if ! wait_ce_project_creation "${JWT_TOKEN}" "${REGION}" "${CODE_ENGINE_PROJECT_GUID}"; then - log "ERROR: Code Engine project creation timeout expired." - exit 7 -fi - -# We can associate the Code Engine project id to the Reservation -log "INFO: Code Engine project (GUID: ${CODE_ENGINE_PROJECT_GUID}) is going to be associated to the Reservation with GUID ${RESERVATION_GUID}." -response=$(associate_ce_project_to_reservation "${JWT_TOKEN}" "${REGION}" "${CODE_ENGINE_PROJECT_GUID}" "${RESERVATION_GUID}") -http_code=$(echo "${response}" | head -n 1) -response_message=$(echo "${response}" | tail -n +2) - -# Check if the a Code Engine Project has been created -if [ "${http_code}" != "200" ]; then - log "ERROR: Cannot associate the Code Engine project with guid ${CODE_ENGINE_PROJECT_GUID} to the Reservation with GUID ${RESERVATION_GUID}." - log "ERROR Details: ${response_message}." - exit 8 -fi - -log "INFO: Code Engine project (GUID: ${CODE_ENGINE_PROJECT_GUID}) has been successfully associated to the Reservation with GUID ${RESERVATION_GUID}." -log "INFO: CODE_ENGINE_PROJECT_GUID=${CODE_ENGINE_PROJECT_GUID}" -log "INFO: ${0} successfully completed." diff --git a/modules/ce_project/scripts/create-update-ce-project.sh b/modules/ce_project/scripts/create-update-ce-project.sh deleted file mode 100755 index bd69ae06..00000000 --- a/modules/ce_project/scripts/create-update-ce-project.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# -# Licensed Materials - Property of IBM -# 5725-S00 (C) Copyright IBM Corp. 2024. All Rights Reserved. -# US Government Users Restricted Rights - Use, duplication or -# disclosure restricted by GSA ADP Schedule Contract with IBM Corp. -SCRIPT_DIR=$(realpath "$(dirname "$0")") -TMP_DIR="/tmp" -LOG_FILE="${TMP_DIR}/hpcaas-check-reservation.log" -CODE_ENGINE_PROJECT_GUID="" - -LOG_OUTPUT=$("${SCRIPT_DIR}"/check_reservation.sh --region "${REGION}" --resource-group-id "${RESOURCE_GROUP_ID}" --output "${LOG_FILE}" 2>&1) -RETURN_CODE=$? 
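The wrapper above (also removed in this change set) only has to turn the child script's log into the JSON document that the scottwinkler/shell provider reads from the create command's stdout: pick the CODE_ENGINE_PROJECT_GUID=<guid> line out of the log and wrap it together with the raw output. A rough Python equivalent, with a made-up log line purely for illustration:

import json
import re


def to_shell_provider_output(log_output: str) -> str:
    # Extract the GUID announced by check_reservation.sh (empty string if absent)
    # and emit the {"guid": ..., "logs": ...} document that modules/ce_project
    # consumes as output["guid"] and output["logs"].
    match = re.search(r"CODE_ENGINE_PROJECT_GUID=(\S+)", log_output)
    guid = match.group(1) if match else ""
    return json.dumps({"guid": guid, "logs": log_output})


if __name__ == "__main__":
    sample_log = "[20240101 00:00:00] INFO: CODE_ENGINE_PROJECT_GUID=abc-123"  # hypothetical
    print(to_shell_provider_output(sample_log))

One practical difference: json.dumps escapes any quotes or newlines in the captured log text, which the printf template in the original script does not.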
- -if [ ${RETURN_CODE} -eq 0 ]; then - # Estract the row containing CODE_ENGINE_PROJECT_GUID - GUID_LINE=$(echo "${LOG_OUTPUT}" | grep "CODE_ENGINE_PROJECT_GUID") - # If that line exists, extract the CE Project GUID - if [ -n "${GUID_LINE}" ]; then - CODE_ENGINE_PROJECT_GUID=$(echo "$GUID_LINE" | awk -F'=' '{print $2}') - fi -fi - -JSON_OUTPUT=$(printf '{ - "guid": "%s", - "logs": "%s" -}' "${CODE_ENGINE_PROJECT_GUID}" "${LOG_OUTPUT}") - -echo "${JSON_OUTPUT}" -exit ${RETURN_CODE} diff --git a/modules/ce_project/scripts/delete-ce-project.sh b/modules/ce_project/scripts/delete-ce-project.sh deleted file mode 100755 index 90f09b2e..00000000 --- a/modules/ce_project/scripts/delete-ce-project.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -# We do not need this. diff --git a/modules/ce_project/variables.tf b/modules/ce_project/variables.tf deleted file mode 100644 index 07a0dcce..00000000 --- a/modules/ce_project/variables.tf +++ /dev/null @@ -1,30 +0,0 @@ -variable "ibmcloud_api_key" { - description = "IBM Cloud API key for the IBM Cloud account where the IBM Cloud HPC cluster needs to be deployed. For more information on how to create an API key, see [Managing user API keys](https://cloud.ibm.com/docs/account?topic=account-userapikey)." - type = string - sensitive = true - default = null -} - -variable "reservation_id" { - type = string - sensitive = true - description = "Ensure that you have received the reservation ID from IBM technical sales. Reservation ID is a unique identifier to distinguish different IBM Cloud HPC service agreements. It must start with a letter and can only contain letters, numbers, hyphens (-), or underscores (_)." - default = null -} - -variable "region" { - description = "The region where the Code Engine project must be instantiated" - type = string -} - -variable "resource_group_id" { - description = "String describing resource groups to create or reference" - type = string - default = null -} - -variable "solution" { - type = string - default = "hpc" - description = "This is required to define a specific solution for the creation of reservation id's" -} diff --git a/modules/ce_project/version.tf b/modules/ce_project/version.tf deleted file mode 100644 index 15ed1442..00000000 --- a/modules/ce_project/version.tf +++ /dev/null @@ -1,13 +0,0 @@ -terraform { - required_version = ">= 1.9.0" - required_providers { - shell = { - source = "scottwinkler/shell" - version = "1.7.10" - } - null = { - source = "hashicorp/null" - version = ">= 3.0.0" - } - } -} diff --git a/modules/common/client_configuration/client_configuration.tf b/modules/common/client_configuration/client_configuration.tf new file mode 100644 index 00000000..fe3ad0bb --- /dev/null +++ b/modules/common/client_configuration/client_configuration.tf @@ -0,0 +1,42 @@ +resource "local_sensitive_file" "write_client_meta_private_key" { + count = (tobool(var.turn_on) == true && tobool(var.write_inventory_complete) == true) ? 1 : 0 + content = var.client_meta_private_key + filename = local.client_private_key + file_permission = "0600" +} + +resource "null_resource" "prepare_client_inventory_using_jumphost_connection" { + count = (tobool(var.turn_on) == true && tobool(var.storage_cluster_create_complete) == true && tobool(var.using_jumphost_connection) == true && tobool(var.create_scale_cluster) == true) ? 
1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "python3 ${local.ansible_inv_script_path} --client_tf_inv_path ${var.client_inventory_path} --install_infra_path ${var.clone_path} --instance_private_key ${local.client_private_key} --bastion_user ${var.bastion_user} --bastion_ip ${var.bastion_instance_public_ip} --bastion_ssh_private_key ${var.bastion_ssh_private_key} --enable_ldap ${var.enable_ldap} --ldap_basedns ${var.ldap_basedns} --ldap_server ${var.ldap_server} --ldap_admin_password ${var.ldap_admin_password}" + } + triggers = { + build = timestamp() + } + depends_on = [resource.local_sensitive_file.write_client_meta_private_key] +} + +resource "null_resource" "prepare_client_inventory" { + count = (tobool(var.turn_on) == true && tobool(var.storage_cluster_create_complete) == true && tobool(var.using_jumphost_connection) == false && tobool(var.create_scale_cluster) == true) ? 1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "python3 ${local.ansible_inv_script_path} --client_tf_inv_path ${var.client_inventory_path} --install_infra_path ${var.clone_path} --instance_private_key ${local.client_private_key} --enable_ldap ${var.enable_ldap} --ldap_basedns ${var.ldap_basedns} --ldap_server ${var.ldap_server} --ldap_admin_password ${var.ldap_admin_password}" + } + triggers = { + build = timestamp() + } + depends_on = [resource.local_sensitive_file.write_client_meta_private_key] +} + +resource "null_resource" "perform_client_configuration" { + count = (tobool(var.turn_on) == true && tobool(var.storage_cluster_create_complete) == true && tobool(var.create_scale_cluster) == true) ? 1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "ansible-playbook -i ${local.client_inventory_path} ${local.client_playbook}" + } + triggers = { + build = timestamp() + } + depends_on = [resource.local_sensitive_file.write_client_meta_private_key, resource.null_resource.prepare_client_inventory_using_jumphost_connection, resource.null_resource.prepare_client_inventory] +} diff --git a/modules/common/client_configuration/locals.tf b/modules/common/client_configuration/locals.tf new file mode 100644 index 00000000..58b2457d --- /dev/null +++ b/modules/common/client_configuration/locals.tf @@ -0,0 +1,8 @@ + +locals { + client_inventory_path = format("%s/%s/client_inventory.ini", var.clone_path, "ibm-spectrum-scale-install-infra") + client_playbook = format("%s/%s/client_cloud_playbook.yaml", var.clone_path, "ibm-spectrum-scale-install-infra") + scripts_path = replace(path.module, "client_configuration", "scripts") + ansible_inv_script_path = format("%s/prepare_client_inv.py", local.scripts_path) + client_private_key = format("%s/client_key/id_rsa", var.clone_path) +} diff --git a/modules/my_ip/main.tf b/modules/common/client_configuration/main.tf similarity index 100% rename from modules/my_ip/main.tf rename to modules/common/client_configuration/main.tf diff --git a/modules/common/client_configuration/outputs.tf b/modules/common/client_configuration/outputs.tf new file mode 100644 index 00000000..5e172c32 --- /dev/null +++ b/modules/common/client_configuration/outputs.tf @@ -0,0 +1,5 @@ +output "client_create_complete" { + value = true + description = "Client cluster create complete" + depends_on = [resource.local_sensitive_file.write_client_meta_private_key, resource.null_resource.prepare_client_inventory_using_jumphost_connection, resource.null_resource.prepare_client_inventory, 
resource.null_resource.perform_client_configuration] +} diff --git a/modules/common/client_configuration/variables.tf b/modules/common/client_configuration/variables.tf new file mode 100644 index 00000000..b26a31d4 --- /dev/null +++ b/modules/common/client_configuration/variables.tf @@ -0,0 +1,75 @@ +variable "turn_on" { + type = string + description = "It is used to turn on the null resources based on conditions." +} + +variable "write_inventory_complete" { + type = string + description = "It is used to confirm inventory file written is completed." +} + +variable "create_scale_cluster" { + type = string + description = "It enables scale cluster configuration." +} + +variable "clone_path" { + type = string + description = "Scale repo clone path" +} + +variable "using_jumphost_connection" { + type = bool + description = "If true, will skip the jump/bastion host configuration." +} + +variable "bastion_user" { + type = string + description = "Provide the username for Bastion login." +} + +variable "bastion_instance_public_ip" { + type = string + description = "Bastion instance public ip address." +} + +variable "bastion_ssh_private_key" { + type = string + description = "Bastion SSH private key path, which will be used to login to bastion host." +} + +variable "enable_ldap" { + type = bool + description = "Set this option to true to enable LDAP for IBM Cloud HPC, with the default value set to false." +} + +variable "ldap_basedns" { + type = string + description = "The dns domain name is used for configuring the LDAP server. If an LDAP server is already in existence, ensure to provide the associated DNS domain name." +} + +variable "ldap_server" { + type = string + description = "Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created." +} + +variable "ldap_admin_password" { + type = string + sensitive = true + description = "The LDAP administrative password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@_+:) are required. It is important to avoid including the username in the password for enhanced security." +} + +variable "storage_cluster_create_complete" { + type = bool + description = "Storage cluster crete complete" +} + +variable "client_inventory_path" { + type = string + description = "Client inventory path" +} + +variable "client_meta_private_key" { + type = string + description = "Client SSH private key path, which will be used to login to client host." 
+} diff --git a/modules/common/client_configuration/version.tf b/modules/common/client_configuration/version.tf new file mode 100644 index 00000000..4ba00afc --- /dev/null +++ b/modules/common/client_configuration/version.tf @@ -0,0 +1,18 @@ +############################################################################## +# Terraform Providers +############################################################################## + +terraform { + required_version = ">= 1.9.0" + # Use "greater than or equal to" range for root level modules + required_providers { + local = { + source = "hashicorp/local" + version = "~> 2" + } + null = { + source = "hashicorp/null" + version = ">= 3.0.0" + } + } +} diff --git a/modules/common/compute_configuration/compute_configuration.tf b/modules/common/compute_configuration/compute_configuration.tf new file mode 100644 index 00000000..49a273a4 --- /dev/null +++ b/modules/common/compute_configuration/compute_configuration.tf @@ -0,0 +1,105 @@ +/* + Excutes ansible playbook to install IBM Spectrum Scale compute cluster. +*/ + +resource "local_file" "create_compute_tuning_parameters" { + count = (tobool(var.turn_on) == true && tobool(var.write_inventory_complete) == true) ? 1 : 0 + content = < 1: + # Storage/NSD nodes to be quorum nodes (quorum_count - 2 as index starts from 0) + start_quorum_assign = quorum_count - 2 + else: + # Storage/NSD nodes to be quorum nodes (quorum_count - 1 as index starts from 0) + start_quorum_assign = quorum_count - 1 + + for each_ip in storage_cluster_instance_names: + if storage_cluster_instance_names.index(each_ip) <= ( + start_quorum_assign + ) and storage_cluster_instance_names.index(each_ip) <= (manager_count - 1): + if storage_cluster_instance_names.index(each_ip) == 0: + node = { + "ip_addr": each_ip, + "is_quorum": True, + "is_manager": True, + "is_gui": True, + "is_collector": True, + "is_nsd": True, + "is_admin": True, + "user": user, + "key_file": key_file, + "class": "storagenodegrp", + } + elif storage_cluster_instance_names.index(each_ip) == 1: + node = { + "ip_addr": each_ip, + "is_quorum": True, + "is_manager": True, + "is_gui": False, + "is_collector": True, + "is_nsd": True, + "is_admin": True, + "user": user, + "key_file": key_file, + "class": "storagenodegrp", + } + else: + node = { + "ip_addr": each_ip, + "is_quorum": True, + "is_manager": True, + "is_gui": False, + "is_collector": False, + "is_nsd": True, + "is_admin": True, + "user": user, + "key_file": key_file, + "class": "storagenodegrp", + } + elif storage_cluster_instance_names.index(each_ip) <= ( + start_quorum_assign + ) and storage_cluster_instance_names.index(each_ip) > (manager_count - 1): + node = { + "ip_addr": each_ip, + "is_quorum": True, + "is_manager": False, + "is_gui": False, + "is_collector": False, + "is_nsd": True, + "is_admin": True, + "user": user, + "key_file": key_file, + "class": "storagenodegrp", + } + else: + node = { + "ip_addr": each_ip, + "is_quorum": False, + "is_manager": False, + "is_gui": False, + "is_collector": False, + "is_nsd": True, + "is_admin": False, + "user": user, + "key_file": key_file, + "class": "storagenodegrp", + } + node_details.append(get_host_format(node)) + + if az_count > 1: + if len(storage_private_ips) - len(desc_private_ips) >= quorum_count: + quorums_left = 0 + else: + quorums_left = ( + quorum_count - len(storage_private_ips) - len(desc_private_ips) + ) + else: + if len(storage_private_ips) > quorum_count: + quorums_left = 0 + else: + quorums_left = quorum_count - len(storage_private_ips) + + # Additional 
quorums assign to compute nodes + if quorums_left > 0: + for each_ip in compute_cluster_instance_names[0:quorums_left]: + node = { + "ip_addr": each_ip, + "is_quorum": True, + "is_manager": False, + "is_gui": False, + "is_collector": False, + "is_nsd": False, + "is_admin": True, + "user": user, + "key_file": key_file, + "class": "computenodegrp", + } + node_details.append(get_host_format(node)) + for each_ip in compute_cluster_instance_names[quorums_left:]: + node = { + "ip_addr": each_ip, + "is_quorum": False, + "is_manager": False, + "is_gui": False, + "is_collector": False, + "is_nsd": False, + "is_admin": False, + "user": user, + "key_file": key_file, + "class": "computenodegrp", + } + node_details.append(get_host_format(node)) + + if quorums_left == 0: + for each_ip in compute_cluster_instance_names: + node = { + "ip_addr": each_ip, + "is_quorum": False, + "is_manager": False, + "is_gui": False, + "is_collector": False, + "is_nsd": False, + "is_admin": False, + "user": user, + "key_file": key_file, + "class": "computenodegrp", + } + node_details.append(get_host_format(node)) + return node_details + + +def initialize_scale_config_details(list_nodclass_param_dict): + """Initialize scale cluster config details. + :args: node_class (list), comp_nodeclass_config (dict), mgmt_nodeclass_config (dict), strg_desc_nodeclass_config (dict), strg_nodeclass_config (dict), proto_nodeclass_config (dict), strg_proto_nodeclass_config (dict) + """ + scale_config = {} + scale_config["scale_config"], scale_config["scale_cluster_config"] = [], {} + + for param_dicts in list_nodclass_param_dict: + if param_dicts[1] != {}: + scale_config["scale_config"].append( + { + "nodeclass": list(param_dicts[0].values())[0], + "params": [param_dicts[1]], + } + ) + + scale_config["scale_cluster_config"]["ephemeral_port_range"] = "60000-61000" + return scale_config + + +def get_disks_list(az_count, disk_mapping, desc_disk_mapping, disk_type): + """Initialize disk list.""" + disks_list = [] + if disk_type == "locally-attached": + failureGroup = 0 + for each_ip, disk_per_ip in disk_mapping.items(): + failureGroup = failureGroup + 1 + for each_disk in disk_per_ip: + disks_list.append( + { + "device": each_disk, + "failureGroup": failureGroup, + "servers": each_ip, + "usage": "dataAndMetadata", + "pool": "system", + } + ) + + # Map storage nodes to failure groups based on AZ and subnet variations + else: + failure_group1, failure_group2 = [], [] + if az_count == 1: + # Single AZ, just split list equally + failure_group1 = [ + key for index, key in enumerate(disk_mapping) if index % 2 == 0 + ] + failure_group2 = [ + key for index, key in enumerate(disk_mapping) if index % 2 != 0 + ] + else: + # Multi AZ, split based on subnet match + subnet_pattern = re.compile(r"\d{1,3}\.\d{1,3}\.(\d{1,3})\.\d{1,3}") + subnet1A = subnet_pattern.findall(list(disk_mapping)[0]) + for each_ip in disk_mapping: + current_subnet = subnet_pattern.findall(each_ip) + if current_subnet[0] == subnet1A[0]: + failure_group1.append(each_ip) + else: + failure_group2.append(each_ip) + + storage_instances = [] + max_len = max(len(failure_group1), len(failure_group2)) + idx = 0 + while idx < max_len: + if idx < len(failure_group1): + storage_instances.append(failure_group1[idx]) + + if idx < len(failure_group2): + storage_instances.append(failure_group2[idx]) + + idx = idx + 1 + + for each_ip, disk_per_ip in disk_mapping.items(): + if each_ip in failure_group1: + for each_disk in disk_per_ip: + disks_list.append( + { + "device": each_disk, + "failureGroup": 1, 
+ "servers": each_ip, + "usage": "dataAndMetadata", + "pool": "system", + } + ) + if each_ip in failure_group2: + for each_disk in disk_per_ip: + disks_list.append( + { + "device": each_disk, + "failureGroup": 2, + "servers": each_ip, + "usage": "dataAndMetadata", + "pool": "system", + } + ) + + # Append "descOnly" disk details + if len(desc_disk_mapping.keys()): + disks_list.append( + { + "device": list(desc_disk_mapping.values())[0][0], + "failureGroup": 3, + "servers": list(desc_disk_mapping.keys())[0], + "usage": "descOnly", + "pool": "system", + } + ) + return disks_list + + +def initialize_scale_storage_details( + az_count, + fs_mount, + block_size, + disk_details, + default_metadata_replicas, + max_metadata_replicas, + default_data_replicas, + max_data_replicas, + filesets, +): + """Initialize storage details. + :args: az_count (int), fs_mount (string), block_size (string), + disks_list (list), filesets (dictionary) + """ + filesets_name_size = {key.split("/")[-1]: value for key, value in filesets.items()} + + storage = {} + storage["scale_storage"] = [] + if not default_data_replicas: + if az_count > 1: + default_data_replicas = 2 + default_metadata_replicas = 2 + else: + default_data_replicas = 1 + default_metadata_replicas = 2 + + storage["scale_storage"].append( + { + "filesystem": pathlib.PurePath(fs_mount).name, + "blockSize": block_size, + "defaultDataReplicas": default_data_replicas, + "defaultMetadataReplicas": default_metadata_replicas, + "maxDataReplicas": max_data_replicas, + "maxMetadataReplicas": max_metadata_replicas, + "automaticMountOption": "true", + "defaultMountPoint": fs_mount, + "disks": disk_details, + "filesets": filesets_name_size, + } + ) + return storage + + +def initialize_scale_ces_details( + smb, + nfs, + object, + export_ip_pool, + filesystem, + mountpoint, + filesets, + protocol_cluster_instance_names, + enable_ces, +): + """Initialize ces details. + :args: smb (bool), nfs (bool), object (bool), + export_ip_pool (list), filesystem (string), mountpoint (string) + """ + exports = [] + export_node_ip_map = [] + if enable_ces == "True": + filesets_name_size = { + key.split("/")[-1]: value for key, value in filesets.items() + } + exports = list(filesets_name_size.keys()) + + # Creating map of CES nodes and it Ips + export_node_ip_map = [ + {protocol_cluster_instance_name.split(".")[0]: ip} + for protocol_cluster_instance_name, ip in zip( + protocol_cluster_instance_names, export_ip_pool + ) + ] + + ces = { + "scale_protocols": { + "nfs": nfs, + "object": object, + "smb": smb, + "export_node_ip_map": export_node_ip_map, + "filesystem": filesystem, + "mountpoint": mountpoint, + "exports": exports, + } + } + return ces + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description="Convert terraform inventory " + "to ansible inventory format " + "install and configuration." 
+ ) + PARSER.add_argument( + "--tf_inv_path", required=True, help="Terraform inventory file path" + ) + PARSER.add_argument( + "--install_infra_path", + required=True, + help="Spectrum Scale install infra clone parent path", + ) + PARSER.add_argument( + "--instance_private_key", + required=True, + help="Spectrum Scale instances SSH private key path", + ) + PARSER.add_argument("--bastion_user", help="Bastion OS Login username") + PARSER.add_argument("--bastion_ip", help="Bastion SSH public ip address") + PARSER.add_argument( + "--bastion_ssh_private_key", help="Bastion SSH private key path" + ) + PARSER.add_argument("--memory_size", help="Instance memory size") + PARSER.add_argument( + "--max_pagepool_gb", help="maximum pagepool size in GB", default=1 + ) + PARSER.add_argument("--disk_type", help="Disk type") + PARSER.add_argument( + "--default_data_replicas", help="Value for default data replica" + ) + PARSER.add_argument("--max_data_replicas", help="Value for max data replica") + PARSER.add_argument( + "--default_metadata_replicas", help="Value for default metadata replica" + ) + PARSER.add_argument( + "--max_metadata_replicas", help="Value for max metadata replica" + ) + PARSER.add_argument("--using_packer_image", help="skips gpfs rpm copy") + PARSER.add_argument("--using_rest_initialization", help="skips gui configuration") + PARSER.add_argument( + "--gui_username", required=True, help="Spectrum Scale GUI username" + ) + PARSER.add_argument( + "--gui_password", required=True, help="Spectrum Scale GUI password" + ) + PARSER.add_argument( + "--enable_mrot_conf", required=True, help="Configure MROT and Logical Subnet" + ) + PARSER.add_argument( + "--enable_ces", required=True, help="Configure CES on protocol nodes" + ) + PARSER.add_argument("--verbose", action="store_true", help="print log messages") + PARSER.add_argument( + "--scale_encryption_servers", + help="List of key servers for encryption", + default=[], + ) + PARSER.add_argument( + "--scale_encryption_admin_password", + help="Admin Password for the Key server", + default="null", + ) + PARSER.add_argument( + "--scale_encryption_type", + help="Encryption type should be either GKLM or Key_Protect", + default="null", + ) + PARSER.add_argument( + "--scale_encryption_enabled", help="Enabling encryption feature", default=False + ) + PARSER.add_argument("--enable_ldap", help="Enabling the LDAP", default=False) + PARSER.add_argument("--ldap_basedns", help="Base domain of ldap", default="null") + PARSER.add_argument("--ldap_server", help="LDAP Server IP", default="null") + PARSER.add_argument( + "--ldap_admin_password", help="LDAP Admin Password", default="null" + ) + PARSER.add_argument( + "--colocate_protocol_instances", + help="It checks if colocation is enabled", + default=False, + ) + PARSER.add_argument( + "--is_colocate_protocol_subset", + help="It checks if protocol node count is less than storage NSD node count", + default=False, + ) + PARSER.add_argument("--comp_memory", help="Compute node memory", default=32) + PARSER.add_argument( + "--comp_vcpus_count", help="Compute node vcpus count", default=8 + ) + PARSER.add_argument( + "--comp_bandwidth", help="Compute node bandwidth", default=16000 + ) + PARSER.add_argument("--mgmt_memory", help="Management node memory", default=32) + PARSER.add_argument( + "--mgmt_vcpus_count", help="Management node vcpus count", default=8 + ) + PARSER.add_argument( + "--mgmt_bandwidth", help="Management node bandwidth", default=16000 + ) + PARSER.add_argument( + "--strg_desc_memory", help="Tie breaker 
node memory", default=32 + ) + PARSER.add_argument( + "--strg_desc_vcpus_count", help="Tie breaker node vcpus count", default=8 + ) + PARSER.add_argument( + "--strg_desc_bandwidth", help="Tie breaker node bandwidth", default=16000 + ) + PARSER.add_argument("--strg_memory", help="Storage NDS node memory", default=32) + PARSER.add_argument( + "--strg_vcpus_count", help="Storage NDS node vcpuscount", default=8 + ) + PARSER.add_argument( + "--strg_bandwidth", help="Storage NDS node bandwidth", default=16000 + ) + PARSER.add_argument("--proto_memory", help="Protocol node memory", default=32) + PARSER.add_argument( + "--proto_vcpus_count", help="Protocol node vcpus count", default=8 + ) + PARSER.add_argument( + "--proto_bandwidth", help="Protocol node bandwidth", default=16000 + ) + PARSER.add_argument( + "--strg_proto_memory", help="Storage protocol node memory", default=32 + ) + PARSER.add_argument( + "--strg_proto_vcpus_count", help="Storage protocol node vcpus count", default=8 + ) + PARSER.add_argument( + "--strg_proto_bandwidth", help="Storage protocol node bandwidth", default=16000 + ) + PARSER.add_argument("--enable_afm", help="enable AFM", default="null") + PARSER.add_argument("--afm_memory", help="AFM node memory", default=32) + PARSER.add_argument("--afm_vcpus_count", help="AFM node vcpus count", default=8) + PARSER.add_argument("--afm_bandwidth", help="AFM node bandwidth", default=16000) + PARSER.add_argument( + "--enable_key_protect", help="enable key protect", default="null" + ) + ARGUMENTS = PARSER.parse_args() + + cluster_type, gui_username, gui_password = None, None, None + profile_path, replica_config, scale_config = None, None, {} + # Step-1: Read the inventory file + TF = read_json_file(ARGUMENTS.tf_inv_path) + if ARGUMENTS.verbose: + print("Parsed terraform output: %s" % json.dumps(TF, indent=4)) + + # Step-2: Identify the cluster type + if ( + len(TF["storage_cluster_instance_private_ips"]) == 0 + and len(TF["compute_cluster_instance_private_ips"]) > 0 + ): + cluster_type = "compute" + cleanup( + "%s/%s/%s_inventory.ini" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + ) + cleanup( + "%s/%s_cluster_gui_details.json" + % (str(pathlib.PurePath(ARGUMENTS.tf_inv_path).parent), cluster_type) + ) + cleanup( + "/%s/%s/%s_cloud_playbook.yaml" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + ) + cleanup( + "%s/%s/%s/%s" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + "group_vars", + "%s_cluster_config.yaml" % cluster_type, + ) + ) + gui_username = ARGUMENTS.gui_username + gui_password = ARGUMENTS.gui_password + profile_path = "%s/computesncparams" % ARGUMENTS.install_infra_path + replica_config = False + computenodegrp = generate_nodeclass_config( + "computenodegrp", + ARGUMENTS.comp_memory, + ARGUMENTS.comp_vcpus_count, + ARGUMENTS.comp_bandwidth, + ) + managementnodegrp = generate_nodeclass_config( + "managementnodegrp", + ARGUMENTS.mgmt_memory, + ARGUMENTS.mgmt_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + scale_config = initialize_scale_config_details( + [computenodegrp, managementnodegrp] + ) + elif ( + len(TF["compute_cluster_instance_private_ips"]) == 0 + and len(TF["storage_cluster_instance_private_ips"]) > 0 + and len(TF["vpc_availability_zones"]) == 1 + ): + # single az storage cluster + cluster_type = "storage" + cleanup( + "%s/%s/%s_inventory.ini" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + 
) + cleanup( + "%s/%s_cluster_gui_details.json" + % (str(pathlib.PurePath(ARGUMENTS.tf_inv_path).parent), cluster_type) + ) + cleanup( + "/%s/%s/%s_cloud_playbook.yaml" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + ) + cleanup( + "%s/%s/%s/%s" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + "group_vars", + "%s_cluster_config.yaml" % cluster_type, + ) + ) + gui_username = ARGUMENTS.gui_username + gui_password = ARGUMENTS.gui_password + profile_path = "%s/storagesncparams" % ARGUMENTS.install_infra_path + replica_config = bool(len(TF["vpc_availability_zones"]) > 1) + + managementnodegrp = generate_nodeclass_config( + "managementnodegrp", + ARGUMENTS.mgmt_memory, + ARGUMENTS.mgmt_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + storagedescnodegrp = generate_nodeclass_config( + "storagedescnodegrp", + ARGUMENTS.strg_desc_memory, + ARGUMENTS.strg_desc_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + storagenodegrp = generate_nodeclass_config( + "storagenodegrp", + ARGUMENTS.strg_memory, + ARGUMENTS.strg_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + protocolnodegrp = generate_nodeclass_config( + "protocolnodegrp", + ARGUMENTS.proto_memory, + ARGUMENTS.proto_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + storageprotocolnodegrp = generate_nodeclass_config( + "storageprotocolnodegrp", + ARGUMENTS.strg_proto_memory, + ARGUMENTS.strg_proto_vcpus_count, + ARGUMENTS.strg_proto_bandwidth, + ) + afmgatewaygrp = generate_nodeclass_config( + "afmgatewaygrp", + ARGUMENTS.afm_memory, + ARGUMENTS.afm_vcpus_count, + ARGUMENTS.afm_bandwidth, + ) + afmgatewaygrp[1].update(check_afm_values()) + + nodeclassgrp = [storagedescnodegrp, managementnodegrp] + if ARGUMENTS.enable_ces == "True": + if ARGUMENTS.colocate_protocol_instances == "True": + if ARGUMENTS.is_colocate_protocol_subset == "True": + nodeclassgrp.append(storagenodegrp) + nodeclassgrp.append(storageprotocolnodegrp) + else: + nodeclassgrp.append(storagenodegrp) + nodeclassgrp.append(protocolnodegrp) + else: + nodeclassgrp.append(storagenodegrp) + if ARGUMENTS.enable_afm == "True": + nodeclassgrp.append(afmgatewaygrp) + scale_config = initialize_scale_config_details(nodeclassgrp) + + elif ( + len(TF["compute_cluster_instance_private_ips"]) == 0 + and len(TF["storage_cluster_instance_private_ips"]) > 0 + and len(TF["vpc_availability_zones"]) > 1 + and len(TF["storage_cluster_desc_instance_private_ips"]) > 0 + ): + # multi az storage cluster + cluster_type = "storage" + cleanup( + "%s/%s/%s_inventory.ini" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + ) + cleanup( + "%s/%s_cluster_gui_details.json" + % (str(pathlib.PurePath(ARGUMENTS.tf_inv_path).parent), cluster_type) + ) + cleanup( + "/%s/%s/%s_cloud_playbook.yaml" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + ) + cleanup( + "%s/%s/%s/%s" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + "group_vars", + "%s_cluster_config.yaml" % cluster_type, + ) + ) + gui_username = ARGUMENTS.gui_username + gui_password = ARGUMENTS.gui_password + profile_path = "%s/storagesncparams" % ARGUMENTS.install_infra_path + replica_config = bool(len(TF["vpc_availability_zones"]) > 1) + + managementnodegrp = generate_nodeclass_config( + "managementnodegrp", + ARGUMENTS.mgmt_memory, + ARGUMENTS.mgmt_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + storagedescnodegrp = generate_nodeclass_config( + "storagedescnodegrp", + 
ARGUMENTS.strg_desc_memory, + ARGUMENTS.strg_desc_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + storagenodegrp = generate_nodeclass_config( + "storagenodegrp", + ARGUMENTS.strg_memory, + ARGUMENTS.strg_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + protocolnodegrp = generate_nodeclass_config( + "protocolnodegrp", + ARGUMENTS.proto_memory, + ARGUMENTS.proto_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + storageprotocolnodegrp = generate_nodeclass_config( + "storageprotocolnodegrp", + ARGUMENTS.strg_proto_memory, + ARGUMENTS.strg_proto_vcpus_count, + ARGUMENTS.strg_proto_bandwidth, + ) + afmgatewaygrp = generate_nodeclass_config( + "afmgatewaygrp", + ARGUMENTS.afm_memory, + ARGUMENTS.afm_vcpus_count, + ARGUMENTS.afm_bandwidth, + ) + afmgatewaygrp[1].update(check_afm_values()) + + nodeclassgrp = [storagedescnodegrp, managementnodegrp] + if ARGUMENTS.enable_ces == "True": + if ARGUMENTS.colocate_protocol_instances == "True": + if ARGUMENTS.is_colocate_protocol_subset == "True": + nodeclassgrp.append(storagenodegrp) + nodeclassgrp.append(storageprotocolnodegrp) + else: + nodeclassgrp.append(storagenodegrp) + nodeclassgrp.append(protocolnodegrp) + else: + nodeclassgrp.append(storagenodegrp) + if ARGUMENTS.enable_afm == "True": + nodeclassgrp.append(afmgatewaygrp) + scale_config = initialize_scale_config_details(nodeclassgrp) + + else: + cluster_type = "combined" + cleanup( + "%s/%s/%s_inventory.ini" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + ) + cleanup( + "/%s/%s/%s_cloud_playbook.yaml" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + ) + cleanup( + "%s/%s/%s/%s" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + "group_vars", + "%s_cluster_config.yaml" % cluster_type, + ) + ) + gui_username = ARGUMENTS.gui_username + gui_password = ARGUMENTS.gui_password + profile_path = "%s/scalesncparams" % ARGUMENTS.install_infra_path + replica_config = bool(len(TF["vpc_availability_zones"]) > 1) + + computenodegrp = generate_nodeclass_config( + "computenodegrp", + ARGUMENTS.comp_memory, + ARGUMENTS.comp_vcpus_count, + ARGUMENTS.comp_bandwidth, + ) + managementnodegrp = generate_nodeclass_config( + "managementnodegrp", + ARGUMENTS.mgmt_memory, + ARGUMENTS.mgmt_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + storagedescnodegrp = generate_nodeclass_config( + "storagedescnodegrp", + ARGUMENTS.strg_desc_memory, + ARGUMENTS.strg_desc_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + storagenodegrp = generate_nodeclass_config( + "storagenodegrp", + ARGUMENTS.strg_memory, + ARGUMENTS.strg_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + protocolnodegrp = generate_nodeclass_config( + "protocolnodegrp", + ARGUMENTS.proto_memory, + ARGUMENTS.proto_vcpus_count, + ARGUMENTS.strg_bandwidth, + ) + storageprotocolnodegrp = generate_nodeclass_config( + "storageprotocolnodegrp", + ARGUMENTS.strg_proto_memory, + ARGUMENTS.strg_proto_vcpus_count, + ARGUMENTS.strg_proto_bandwidth, + ) + afmgatewaygrp = generate_nodeclass_config( + "afmgatewaygrp", + ARGUMENTS.afm_memory, + ARGUMENTS.afm_vcpus_count, + ARGUMENTS.afm_bandwidth, + ) + afmgatewaygrp[1].update(check_afm_values()) + + if len(TF["vpc_availability_zones"]) == 1: + nodeclassgrp = [storagedescnodegrp, managementnodegrp, computenodegrp] + if ARGUMENTS.enable_ces == "True": + if ARGUMENTS.colocate_protocol_instances == "True": + if ARGUMENTS.is_colocate_protocol_subset == "True": + nodeclassgrp.append(storagenodegrp) + 
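+ # Note: protocol instances are a subset of the storage NSD nodes in this branch, so both
+ # the plain storage nodeclass and the combined storage+protocol nodeclass are configured.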
nodeclassgrp.append(storageprotocolnodegrp) + else: + nodeclassgrp.append(storagenodegrp) + nodeclassgrp.append(protocolnodegrp) + else: + nodeclassgrp.append(storagenodegrp) + if ARGUMENTS.enable_afm == "True": + nodeclassgrp.append(afmgatewaygrp) + scale_config = initialize_scale_config_details(nodeclassgrp) + else: + nodeclassgrp = [storagedescnodegrp, managementnodegrp, computenodegrp] + if ARGUMENTS.enable_ces == "True": + if ARGUMENTS.colocate_protocol_instances == "True": + if ARGUMENTS.is_colocate_protocol_subset == "True": + nodeclassgrp.append(storagenodegrp) + nodeclassgrp.append(storageprotocolnodegrp) + else: + nodeclassgrp.append(storagenodegrp) + nodeclassgrp.append(protocolnodegrp) + else: + nodeclassgrp.append(storagenodegrp) + if ARGUMENTS.enable_afm == "True": + nodeclassgrp.append(afmgatewaygrp) + scale_config = initialize_scale_config_details(nodeclassgrp) + + print("Identified cluster type: %s" % cluster_type) + + # Step-3: Identify if tie breaker needs to be counted for storage + if len(TF["vpc_availability_zones"]) > 1: + total_node_count = ( + len(TF["compute_cluster_instance_private_ips"]) + + len(TF["storage_cluster_desc_instance_private_ips"]) + + len(TF["storage_cluster_instance_private_ips"]) + ) + else: + total_node_count = len(TF["compute_cluster_instance_private_ips"]) + len( + TF["storage_cluster_instance_private_ips"] + ) + + if ARGUMENTS.verbose: + print("Total node count: ", total_node_count) + + # Determine total number of quorum, manager nodes to be in the cluster + # manager designates the node as part of the pool of nodes from which + # file system managers and token managers are selected. + quorum_count, manager_count = 0, 2 + if total_node_count < 4: + quorum_count = total_node_count + elif 4 <= total_node_count < 10: + quorum_count = 3 + elif 10 <= total_node_count < 19: + quorum_count = 5 + else: + quorum_count = 7 + + if ARGUMENTS.verbose: + print("Total quorum count: ", quorum_count) + + # Step-4: Create playbook + if ( + ARGUMENTS.using_packer_image == "false" + and ARGUMENTS.using_rest_initialization == "true" + ): + playbook_content = prepare_ansible_playbook( + "scale_nodes", + "%s_cluster_config.yaml" % cluster_type, + ARGUMENTS.instance_private_key, + ) + write_to_file( + "/%s/%s/%s_cloud_playbook.yaml" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ), + playbook_content, + ) + elif ( + ARGUMENTS.using_packer_image == "true" + and ARGUMENTS.using_rest_initialization == "true" + ): + playbook_content = prepare_packer_ansible_playbook( + "scale_nodes", "%s_cluster_config.yaml" % cluster_type + ) + write_to_file( + "/%s/%s/%s_cloud_playbook.yaml" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ), + playbook_content, + ) + elif ( + ARGUMENTS.using_packer_image == "false" + and ARGUMENTS.using_rest_initialization == "false" + ): + playbook_content = prepare_nogui_ansible_playbook( + "scale_nodes", "%s_cluster_config.yaml" % cluster_type + ) + write_to_file( + "/%s/%s/%s_cloud_playbook.yaml" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ), + playbook_content, + ) + elif ( + ARGUMENTS.using_packer_image == "true" + and ARGUMENTS.using_rest_initialization == "false" + ): + playbook_content = prepare_nogui_packer_ansible_playbook( + "scale_nodes", "%s_cluster_config.yaml" % cluster_type + ) + write_to_file( + "/%s/%s/%s_cloud_playbook.yaml" + % ( + ARGUMENTS.install_infra_path, + 
"ibm-spectrum-scale-install-infra", + cluster_type, + ), + playbook_content, + ) + if ARGUMENTS.verbose: + print("Content of ansible playbook:\n", playbook_content) + + # Step-4.1: Create Encryption playbook + if ( + ARGUMENTS.scale_encryption_enabled == "true" + and ARGUMENTS.scale_encryption_type == "gklm" + ): + encryption_playbook_content = prepare_ansible_playbook_encryption_gklm() + write_to_file( + "%s/%s/encryption_gklm_playbook.yaml" + % (ARGUMENTS.install_infra_path, "ibm-spectrum-scale-install-infra"), + encryption_playbook_content, + ) + encryption_playbook_content = prepare_ansible_playbook_encryption_cluster( + "scale_nodes" + ) + write_to_file( + "%s/%s/encryption_cluster_playbook.yaml" + % (ARGUMENTS.install_infra_path, "ibm-spectrum-scale-install-infra"), + encryption_playbook_content, + ) + if ARGUMENTS.verbose: + print( + "Content of ansible playbook for encryption:\n", encryption_playbook_content + ) + + # Step-5: Create hosts + config = configparser.ConfigParser(allow_no_value=True) + node_details = initialize_node_details( + len(TF["vpc_availability_zones"]), + cluster_type, + TF["compute_cluster_instance_names"], + TF["storage_cluster_instance_private_ips"], + TF["storage_cluster_instance_names"], + list(TF["storage_cluster_with_data_volume_mapping"].keys()), + TF["afm_cluster_instance_names"], + TF["protocol_cluster_instance_names"], + TF["storage_cluster_desc_instance_private_ips"], + quorum_count, + "root", + ARGUMENTS.instance_private_key, + ) + node_template = "" + for each_entry in node_details: + if ARGUMENTS.bastion_ssh_private_key is None: + each_entry = each_entry + " " + "ansible_ssh_common_args=" "" + node_template = node_template + each_entry + "\n" + else: + proxy_command = f"ssh -p 22 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -W %h:%p {ARGUMENTS.bastion_user}@{ARGUMENTS.bastion_ip} -i {ARGUMENTS.bastion_ssh_private_key}" + each_entry = ( + each_entry + + " " + + "ansible_ssh_common_args='-o ControlMaster=auto -o ControlPersist=30m -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ProxyCommand=\"" + + proxy_command + + "\"'" + ) + node_template = node_template + each_entry + "\n" + + if TF["resource_prefix"]: + cluster_name = TF["resource_prefix"] + else: + cluster_name = "%s.%s" % ("spectrum-scale", cluster_type) + + config["all:vars"] = initialize_cluster_details( + TF["scale_version"], + cluster_name, + cluster_type, + gui_username, + gui_password, + profile_path, + replica_config, + ARGUMENTS.enable_mrot_conf, + ARGUMENTS.enable_ces, + ARGUMENTS.enable_afm, + ARGUMENTS.enable_key_protect, + TF["storage_subnet_cidr"], + TF["compute_subnet_cidr"], + TF["protocol_gateway_ip"], + TF["scale_remote_cluster_clustername"], + ARGUMENTS.scale_encryption_servers, + ARGUMENTS.scale_encryption_admin_password, + ARGUMENTS.scale_encryption_type, + ARGUMENTS.scale_encryption_enabled, + TF["filesystem_mountpoint"], + TF["vpc_region"], + ARGUMENTS.enable_ldap, + ARGUMENTS.ldap_basedns, + ARGUMENTS.ldap_server, + ARGUMENTS.ldap_admin_password, + TF["afm_cos_bucket_details"], + TF["afm_config_details"], + ) + with open( + "%s/%s/%s_inventory.ini" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ), + "w", + ) as configfile: + configfile.write("[scale_nodes]" + "\n") + configfile.write(node_template) + config.write(configfile) + + if ARGUMENTS.verbose: + config.read( + "%s/%s/%s_inventory.ini" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + ) + 
print( + "Content of %s/%s/%s_inventory.ini" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + cluster_type, + ) + ) + print("[scale_nodes]") + print(node_template) + print("[all:vars]") + for each_key in config["all:vars"]: + print("%s: %s" % (each_key, config.get("all:vars", each_key))) + + # Step-6: Create group_vars directory + create_directory( + "%s/%s/%s" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + "group_vars", + ) + ) + # Step-7: Create group_vars + with open( + "%s/%s/%s/%s" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + "group_vars", + "%s_cluster_config.yaml" % cluster_type, + ), + "w", + ) as groupvar: + yaml.dump(scale_config, groupvar, default_flow_style=False) + if ARGUMENTS.verbose: + print( + "group_vars content:\n%s" + % yaml.dump(scale_config, default_flow_style=False) + ) + + if cluster_type in ["storage", "combined"]: + disks_list = get_disks_list( + len(TF["vpc_availability_zones"]), + TF["storage_cluster_with_data_volume_mapping"], + TF["storage_cluster_desc_data_volume_mapping"], + ARGUMENTS.disk_type, + ) + scale_storage = initialize_scale_storage_details( + len(TF["vpc_availability_zones"]), + TF["storage_cluster_filesystem_mountpoint"], + TF["filesystem_block_size"], + disks_list, + int(ARGUMENTS.default_metadata_replicas), + int(ARGUMENTS.max_metadata_replicas), + int(ARGUMENTS.default_data_replicas), + int(ARGUMENTS.max_data_replicas), + TF["filesets"], + ) + scale_protocols = initialize_scale_ces_details( + TF["smb"], + TF["nfs"], + TF["object"], + TF["export_ip_pool"], + TF["filesystem"], + TF["mountpoint"], + TF["filesets"], + TF["protocol_cluster_instance_names"], + ARGUMENTS.enable_ces, + ) + scale_storage_cluster = { + "scale_protocols": scale_protocols["scale_protocols"], + "scale_storage": scale_storage["scale_storage"], + } + with open( + "%s/%s/%s/%s" + % ( + ARGUMENTS.install_infra_path, + "ibm-spectrum-scale-install-infra", + "group_vars", + "%s_cluster_config.yaml" % cluster_type, + ), + "a", + ) as groupvar: + yaml.dump(scale_storage_cluster, groupvar, default_flow_style=False) + if ARGUMENTS.verbose: + print( + "group_vars content:\n%s" + % yaml.dump(scale_storage_cluster, default_flow_style=False) + ) diff --git a/modules/common/scripts/wait_for_ssh_availability.py b/modules/common/scripts/wait_for_ssh_availability.py new file mode 100755 index 00000000..ada2d58a --- /dev/null +++ b/modules/common/scripts/wait_for_ssh_availability.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Copyright IBM Corporation 2018 + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import json +import subprocess +import sys + + +def read_json_file(json_path): + """Read inventory as json file""" + tf_inv = {} + try: + with open(json_path) as json_handler: + try: + tf_inv = json.load(json_handler) + except json.decoder.JSONDecodeError: + print( + "Provided terraform inventory file (%s) is not a valid json." 
+ % json_path + ) + sys.exit(1) + except OSError: + print("Provided terraform inventory file (%s) does not exist." % json_path) + sys.exit(1) + + return tf_inv + + +def local_execution(command_list): + """ + Helper to execute command locally (stores o/p in variable). + :arg: command_list (list) + :return: (out, err, command_pipe.returncode) + """ + sub_command = subprocess.Popen( + command_list, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) + out, err = sub_command.communicate() + return out, err, sub_command.returncode + + +def aws_ec2_wait_running(instance_ids, region): + """ + Wait for EC2 instances to obtain running-ok state. + :args: region(string), instance_ids(list) + """ + print("Waiting for instance's (%s) to obtain running-ok state." % instance_ids) + command = [ + "aws", + "ec2", + "wait", + "instance-status-ok", + "--region", + region, + "--instance-ids", + ] + instance_ids + out, err, code = local_execution(command) + + if code: + print("Instance's did not obtain running-ok state. Existing!") + print("%s: %s %s: %s" % ("stdout", out, "stderr", err)) + sys.exit(1) + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description="Wait for instances to achieve okay state." + ) + PARSER.add_argument( + "--tf_inv_path", required=True, help="Terraform inventory file path" + ) + PARSER.add_argument( + "--cluster_type", + required=True, + help="Cluster type (Ex: compute, storage, combined", + ) + PARSER.add_argument("--verbose", action="store_true", help="print log messages") + ARGUMENTS = PARSER.parse_args() + + # Step-1: Read the inventory file + TF = read_json_file(ARGUMENTS.tf_inv_path) + if ARGUMENTS.verbose: + print("Parsed terraform output: %s" % json.dumps(TF, indent=4)) + + # Step-2: Identify instance id's based cluster_type + target_instance_ids = [] + if TF["cloud_platform"].upper() == "AWS": + if ARGUMENTS.cluster_type == "compute": + target_instance_ids = TF["compute_cluster_instance_ids"] + if TF["bastion_instance_id"] != "None": + target_instance_ids.append(TF["bastion_instance_id"]) + elif ARGUMENTS.cluster_type == "storage": + target_instance_ids = ( + TF["storage_cluster_instance_ids"] + + TF["storage_cluster_desc_instance_ids"] + ) + if TF["bastion_instance_id"] != "None": + target_instance_ids.append(TF["bastion_instance_id"]) + elif ARGUMENTS.cluster_type == "combined": + target_instance_ids = ( + TF["compute_cluster_instance_ids"] + + TF["storage_cluster_instance_ids"] + + TF["storage_cluster_desc_instance_ids"] + ) + if TF["bastion_instance_id"] != "None": + target_instance_ids.append(TF["bastion_instance_id"]) + aws_ec2_wait_running(target_instance_ids, TF["vpc_region"]) diff --git a/modules/common/storage_configuration/locals.tf b/modules/common/storage_configuration/locals.tf new file mode 100644 index 00000000..6780560f --- /dev/null +++ b/modules/common/storage_configuration/locals.tf @@ -0,0 +1,21 @@ +locals { + scripts_path = replace(path.module, "storage_configuration", "scripts") + ansible_inv_script_path = var.inventory_format == "ini" ? format("%s/prepare_scale_inv_ini.py", local.scripts_path) : format("%s/prepare_scale_inv_json.py", local.scripts_path) + wait_for_ssh_script_path = format("%s/wait_for_ssh_availability.py", local.scripts_path) + scale_tuning_config_path = format("%s/%s", var.clone_path, "storagesncparams.profile") + storage_private_key = format("%s/storage_key/id_rsa", var.clone_path) #tfsec:ignore:GEN002 + default_metadata_replicas = var.default_metadata_replicas == null ? 
jsonencode("None") : jsonencode(var.default_metadata_replicas) + default_data_replicas = var.default_data_replicas == null ? jsonencode("None") : jsonencode(var.default_data_replicas) + storage_inventory_path = format("%s/%s/storage_inventory.ini", var.clone_path, "ibm-spectrum-scale-install-infra") + storage_playbook_path = format("%s/%s/storage_cloud_playbook.yaml", var.clone_path, "ibm-spectrum-scale-install-infra") + scale_encryption_servers = var.scale_encryption_enabled && var.scale_encryption_type == "gklm" ? jsonencode(var.scale_encryption_servers) : jsonencode("None") + scale_encryption_admin_password = var.scale_encryption_enabled ? var.scale_encryption_admin_password : "None" + ldap_server_cert_path = format("%s/ldap_key/ldap_cacert.pem", var.clone_path) + colocate_protocol_instances = var.colocate_protocol_instances ? "True" : "False" + is_colocate_protocol_subset = var.is_colocate_protocol_subset ? "True" : "False" + enable_mrot_conf = var.enable_mrot_conf ? "True" : "False" + enable_ces = var.enable_ces ? "True" : "False" + enable_afm = var.enable_afm ? "True" : "False" + enable_key_protect = var.enable_key_protect == "key_protect" ? "True" : "False" + ldap_server = jsonencode(var.ldap_server) +} diff --git a/modules/common/storage_configuration/main.tf b/modules/common/storage_configuration/main.tf new file mode 100644 index 00000000..e69de29b diff --git a/modules/common/storage_configuration/outputs.tf b/modules/common/storage_configuration/outputs.tf new file mode 100644 index 00000000..77b3ecfc --- /dev/null +++ b/modules/common/storage_configuration/outputs.tf @@ -0,0 +1,5 @@ +output "storage_cluster_create_complete" { + value = true + depends_on = [time_sleep.wait_60_seconds, null_resource.wait_for_ssh_availability, null_resource.prepare_ansible_inventory, null_resource.prepare_ansible_inventory_using_jumphost_connection, null_resource.prepare_ansible_inventory_encryption, null_resource.prepare_ansible_inventory_using_jumphost_connection_encryption, null_resource.perform_scale_deployment] + description = "Storage cluster create complete" +} diff --git a/modules/common/storage_configuration/storage_configuration.tf b/modules/common/storage_configuration/storage_configuration.tf new file mode 100644 index 00000000..162d5d68 --- /dev/null +++ b/modules/common/storage_configuration/storage_configuration.tf @@ -0,0 +1,120 @@ +/* + Excutes ansible playbook to install IBM Spectrum Scale storage cluster. +*/ + +resource "local_file" "create_storage_tuning_parameters" { + count = (tobool(var.turn_on) == true && tobool(var.write_inventory_complete) == true) ? 
1 : 0 + content = < { + sequence_string = tostring(count_number) + } + } + name = format("%s-%03s", "${var.prefix}instance", each.value.sequence_string) + resource_group_id = var.resource_group_id + plan = var.cos_instance_plan + location = var.cos_instance_location + service = var.cos_instance_service +} + +resource "ibm_cos_bucket" "cos_bucket_single_site" { + for_each = { + for idx, count_number in range(1, length(local.new_bucket_single_site_region) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in ibm_resource_instance.cos_instance : instance_id[*].id]), idx) + region_location = element(local.new_bucket_single_site_region, idx) + storage_class = element(local.storage_class_single_site, idx) + } + } + bucket_name = format("%s-%03s", "${var.prefix}bucket-new", each.value.sequence_string) + resource_instance_id = each.value.cos_instance + single_site_location = each.value.region_location + storage_class = each.value.storage_class == "" ? "smart" : each.value.storage_class + depends_on = [ibm_resource_instance.cos_instance] +} + +resource "ibm_cos_bucket" "cos_bucket_regional" { + for_each = { + for idx, count_number in range(1, length(local.new_bucket_regional_region) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in ibm_resource_instance.cos_instance : instance_id[*].id]), idx) + region_location = element(local.new_bucket_regional_region, idx) + storage_class = element(local.storage_class_regional, idx) + } + } + bucket_name = format("%s-%03s", "${var.prefix}bucket-new", (each.value.sequence_string + length(local.new_bucket_single_site_region))) + resource_instance_id = each.value.cos_instance + region_location = each.value.region_location + storage_class = each.value.storage_class == "" ? "smart" : each.value.storage_class + depends_on = [ibm_resource_instance.cos_instance] +} + +resource "ibm_cos_bucket" "cos_bucket_cross_region" { + for_each = { + for idx, count_number in range(1, length(local.new_bucket_cross_region) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in ibm_resource_instance.cos_instance : instance_id[*].id]), idx) + region_location = element(local.new_bucket_cross_region, idx) + storage_class = element(local.storage_class_cross_regional, idx) + } + } + bucket_name = format("%s-%03s", "${var.prefix}bucket-new", (each.value.sequence_string + (length(local.new_bucket_single_site_region) + length(local.new_bucket_regional_region)))) + resource_instance_id = each.value.cos_instance + cross_region_location = each.value.region_location + storage_class = each.value.storage_class == "" ? 
"smart" : each.value.storage_class + depends_on = [ibm_resource_instance.cos_instance] +} + +resource "ibm_resource_key" "hmac_key" { + for_each = { + for idx, count_number in range(1, length(local.new_cos_instance) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in ibm_resource_instance.cos_instance : instance_id[*].id]), idx) + } + } + name = format("%s-%03s", "${var.prefix}hmac-key-new", each.value.sequence_string) + resource_instance_id = each.value.cos_instance + parameters = { "HMAC" = true } + role = var.cos_hmac_role +} + +locals { + buckets = concat((flatten([for bucket in ibm_cos_bucket.cos_bucket_single_site : bucket[*].bucket_name])), (flatten([for bucket in ibm_cos_bucket.cos_bucket_regional : bucket[*].bucket_name])), (flatten([for bucket in ibm_cos_bucket.cos_bucket_cross_region : bucket[*].bucket_name]))) + endpoints = concat((flatten([for endpoint in ibm_cos_bucket.cos_bucket_single_site : endpoint[*].s3_endpoint_direct])), (flatten([for endpoint in ibm_cos_bucket.cos_bucket_regional : endpoint[*].s3_endpoint_direct])), (flatten([for endpoint in ibm_cos_bucket.cos_bucket_cross_region : endpoint[*].s3_endpoint_direct]))) + modes = concat(local.mode_single_site, local.mode_regional, local.mode_cross_regional) + filesets = concat(local.afm_fileset_single_site, local.afm_fileset_regional, local.fileset_cross_regional) + + + afm_cos_bucket_details_1 = [for idx, config in var.new_instance_bucket_hmac : { + akey = ibm_resource_key.hmac_key[0].credentials["cos_hmac_keys.access_key_id"] + bucket = (local.buckets)[idx] + skey = ibm_resource_key.hmac_key[0].credentials["cos_hmac_keys.secret_access_key"] + }] + + afm_config_details_1 = [for idx, config in var.new_instance_bucket_hmac : { + bucket = (local.buckets)[idx] + filesystem = local.filesystem + fileset = (local.filesets)[idx] + mode = (local.modes)[idx] + endpoint = "https://${(local.endpoints)[idx]}" + }] +} + +############################################################################################################# +# 2. It uses existing COS instance and creates new COS Bucket and Hmac Key in that instance. 
+############################################################################################################# + +locals { + exstng_instance_new_bkt_hmac = [for instance in var.exstng_instance_new_bucket_hmac : instance.cos_instance] + # New bucket single Site + exstng_instance_new_bkt_hmac_single_site = [for instance in var.exstng_instance_new_bucket_hmac : instance.cos_instance if instance.bucket_type == "single_site_location"] + exstng_instance_single_site_region = [for region in var.exstng_instance_new_bucket_hmac : region.bucket_region if region.bucket_type == "single_site_location"] + exstng_instance_storage_class_single_site = [for class in var.exstng_instance_new_bucket_hmac : class.bucket_storage_class if class.bucket_type == "single_site_location"] + exstng_instance_mode_single_site = [for mode in var.exstng_instance_new_bucket_hmac : mode.mode if mode.bucket_type == "single_site_location"] + exstng_instance_fileset_single_site = [for fileset in var.exstng_instance_new_bucket_hmac : fileset.afm_fileset if fileset.bucket_type == "single_site_location"] + # New bucket regional + exstng_instance_new_bkt_hmac_regional = [for instance in var.exstng_instance_new_bucket_hmac : instance.cos_instance if instance.bucket_type == "region_location" || instance.bucket_type == ""] + exstng_instance_regional_region = [for region in var.exstng_instance_new_bucket_hmac : region.bucket_region if region.bucket_type == "region_location" || region.bucket_type == ""] + exstng_instance_storage_class_regional = [for class in var.exstng_instance_new_bucket_hmac : class.bucket_storage_class if class.bucket_type == "region_location" || class.bucket_type == ""] + exstng_instance_mode_regional = [for mode in var.exstng_instance_new_bucket_hmac : mode.mode if mode.bucket_type == "region_location" || mode.bucket_type == ""] + exstng_instance_fileset_regional = [for fileset in var.exstng_instance_new_bucket_hmac : fileset.afm_fileset if fileset.bucket_type == "region_location" || fileset.bucket_type == ""] + # New bucket cross region + exstng_instance_new_bkt_hmac_cross_regional = [for instance in var.exstng_instance_new_bucket_hmac : instance.cos_instance if instance.bucket_type == "cross_region_location"] + exstng_instance_cross_regional = [for region in var.exstng_instance_new_bucket_hmac : region.bucket_region if region.bucket_type == "cross_region_location"] + exstng_instance_storage_class_cross_regional = [for class in var.exstng_instance_new_bucket_hmac : class.bucket_storage_class if class.bucket_type == "cross_region_location"] + exstng_instance_mode_cross_regional = [for mode in var.exstng_instance_new_bucket_hmac : mode.mode if mode.bucket_type == "cross_region_location"] + exstng_instance_fileset_cross_regional = [for fileset in var.exstng_instance_new_bucket_hmac : fileset.afm_fileset if fileset.bucket_type == "cross_region_location"] +} + +data "ibm_resource_instance" "existing_cos_instance_single_site" { + for_each = { + for idx, value in local.exstng_instance_new_bkt_hmac_single_site : idx => { + cos_instance = element(local.exstng_instance_new_bkt_hmac_single_site, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +resource "ibm_cos_bucket" "existing_instance_new_cos_bucket_single_site" { + for_each = { + for idx, count_number in range(1, length(local.exstng_instance_single_site_region) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in data.ibm_resource_instance.existing_cos_instance_single_site : 
instance_id[*].id]), idx) + region_location = element(local.exstng_instance_single_site_region, idx) + storage_class = element(local.exstng_instance_storage_class_single_site, idx) + } + } + bucket_name = format("%s-%03s", "${var.prefix}bucket", each.value.sequence_string) + resource_instance_id = each.value.cos_instance + single_site_location = each.value.region_location + storage_class = each.value.storage_class == "" ? "smart" : each.value.storage_class + depends_on = [data.ibm_resource_instance.existing_cos_instance_single_site] +} + +data "ibm_resource_instance" "existing_cos_instance_bucket_regional" { + for_each = { + for idx, value in local.exstng_instance_new_bkt_hmac_regional : idx => { + cos_instance = element(local.exstng_instance_new_bkt_hmac_regional, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +resource "ibm_cos_bucket" "existing_instance_new_cos_bucket_regional" { + for_each = { + for idx, count_number in range(1, length(local.exstng_instance_regional_region) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in data.ibm_resource_instance.existing_cos_instance_bucket_regional : instance_id[*].id]), idx) + region_location = element(local.exstng_instance_regional_region, idx) + storage_class = element(local.exstng_instance_storage_class_regional, idx) + } + } + bucket_name = format("%s-%03s", "${var.prefix}bucket", (each.value.sequence_string + length(local.exstng_instance_single_site_region))) + resource_instance_id = each.value.cos_instance + region_location = each.value.region_location + storage_class = each.value.storage_class == "" ? "smart" : each.value.storage_class + depends_on = [data.ibm_resource_instance.existing_cos_instance_bucket_regional] +} + +data "ibm_resource_instance" "existing_cos_instancecross_regional" { + for_each = { + for idx, value in local.exstng_instance_new_bkt_hmac_cross_regional : idx => { + cos_instance = element(local.exstng_instance_new_bkt_hmac_cross_regional, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +resource "ibm_cos_bucket" "existing_instance_new_cos_bucket_cross_regional" { + for_each = { + for idx, count_number in range(1, length(local.exstng_instance_cross_regional) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in data.ibm_resource_instance.existing_cos_instancecross_regional : instance_id[*].id]), idx) + region_location = element(local.exstng_instance_cross_regional, idx) + storage_class = element(local.exstng_instance_storage_class_cross_regional, idx) + } + } + bucket_name = format("%s-%03s", "${var.prefix}bucket", (each.value.sequence_string + (length(local.exstng_instance_single_site_region) + length(local.exstng_instance_regional_region)))) + resource_instance_id = each.value.cos_instance + cross_region_location = each.value.region_location + storage_class = each.value.storage_class == "" ? 
"smart" : each.value.storage_class + depends_on = [data.ibm_resource_instance.existing_cos_instancecross_regional] +} + +data "ibm_resource_instance" "existing_cos_instance" { + for_each = { + for idx, value in local.exstng_instance_new_bkt_hmac : idx => { + cos_instance = element(local.exstng_instance_new_bkt_hmac, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +resource "ibm_resource_key" "existing_instance_new_hmac_keys" { + for_each = { + for idx, count_number in range(1, length(local.exstng_instance_new_bkt_hmac) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in data.ibm_resource_instance.existing_cos_instance : instance_id[*].id]), idx) + } + } + name = format("%s-%03s", "${var.prefix}hmac-key", each.value.sequence_string) + resource_instance_id = each.value.cos_instance + parameters = { "HMAC" = true } + role = var.cos_hmac_role + depends_on = [data.ibm_resource_instance.existing_cos_instance] +} + +locals { + exstng_instance_buckets = concat((flatten([for bucket in ibm_cos_bucket.existing_instance_new_cos_bucket_single_site : bucket[*].bucket_name])), (flatten([for bucket in ibm_cos_bucket.existing_instance_new_cos_bucket_regional : bucket[*].bucket_name])), (flatten([for bucket in ibm_cos_bucket.existing_instance_new_cos_bucket_cross_regional : bucket[*].bucket_name]))) + exstng_instance_endpoints = concat((flatten([for endpoint in ibm_cos_bucket.existing_instance_new_cos_bucket_single_site : endpoint[*].s3_endpoint_direct])), (flatten([for endpoint in ibm_cos_bucket.existing_instance_new_cos_bucket_regional : endpoint[*].s3_endpoint_direct])), (flatten([for endpoint in ibm_cos_bucket.existing_instance_new_cos_bucket_cross_regional : endpoint[*].s3_endpoint_direct]))) + exstng_instance_modes = concat(local.exstng_instance_mode_single_site, local.exstng_instance_mode_regional, local.exstng_instance_mode_cross_regional) + exstng_instance_filesets = concat(local.exstng_instance_fileset_single_site, local.exstng_instance_fileset_regional, local.exstng_instance_fileset_cross_regional) + + afm_cos_bucket_details_2 = [for idx, config in var.exstng_instance_new_bucket_hmac : { + akey = (flatten([for access_key in ibm_resource_key.existing_instance_new_hmac_keys : access_key[*].credentials["cos_hmac_keys.access_key_id"]]))[idx] + bucket = (local.exstng_instance_buckets)[idx] + skey = (flatten([for secret_access_key in ibm_resource_key.existing_instance_new_hmac_keys : secret_access_key[*].credentials["cos_hmac_keys.secret_access_key"]]))[idx] + }] + + afm_config_details_2 = [for idx, config in var.exstng_instance_new_bucket_hmac : { + bucket = (local.exstng_instance_buckets)[idx] + filesystem = local.filesystem + fileset = (local.exstng_instance_filesets)[idx] + mode = (local.exstng_instance_modes)[idx] + endpoint = "https://${(local.exstng_instance_endpoints)[idx]}" + }] +} + +############################################################################################################# +# 3. It uses existing COS instance and existing Bucket and creates new Hmac Key in that instance. 
+############################################################################################################# + +locals { + exstng_instance_bkt_new_hmac = [for instance in var.exstng_instance_bucket_new_hmac : instance.cos_instance] + exstng_instance_exstng_bucket = [for bucket in var.exstng_instance_bucket_new_hmac : bucket.bucket_name] + region_exstng_instance_bucket_new_hmac = [for region in var.exstng_instance_bucket_new_hmac : region.bucket_region] + exstng_instance_exstng_bucket_type = [for type in var.exstng_instance_bucket_new_hmac : type.bucket_type] +} + +data "ibm_resource_instance" "existing_cos_instance_bucket_new_hmac" { + for_each = { + for idx, value in var.exstng_instance_bucket_new_hmac : idx => { + cos_instance = element(local.exstng_instance_bkt_new_hmac, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +data "ibm_cos_bucket" "existing_cos_instance_bucket" { + for_each = { + for idx, value in var.exstng_instance_bucket_new_hmac : idx => { + bucket_name = element(local.exstng_instance_exstng_bucket, idx) + resource_instance_id = element(flatten([for instance in data.ibm_resource_instance.existing_cos_instance_bucket_new_hmac : instance[*].id]), idx) + bucket_region = element(local.region_exstng_instance_bucket_new_hmac, idx) + bucket_type = element(local.exstng_instance_exstng_bucket_type, idx) + } + } + bucket_name = each.value.bucket_name + resource_instance_id = each.value.resource_instance_id + bucket_region = each.value.bucket_region + bucket_type = each.value.bucket_type + depends_on = [data.ibm_resource_instance.existing_cos_instance_bucket_new_hmac] +} + +resource "ibm_resource_key" "existing_instance_bkt_new_hmac_keys" { + for_each = { + for idx, count_number in range(1, length(var.exstng_instance_bucket_new_hmac) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in data.ibm_resource_instance.existing_cos_instance_bucket_new_hmac : instance_id[*].id]), idx) + } + } + name = format("%s-%03s", "${var.prefix}new-hmac-key", each.value.sequence_string) + resource_instance_id = each.value.cos_instance + parameters = { "HMAC" = true } + role = var.cos_hmac_role + depends_on = [data.ibm_resource_instance.existing_cos_instance_bucket_new_hmac] +} + +locals { + afm_cos_bucket_details_3 = [for idx, config in var.exstng_instance_bucket_new_hmac : { + akey = (flatten([for access_key in ibm_resource_key.existing_instance_bkt_new_hmac_keys : access_key[*].credentials["cos_hmac_keys.access_key_id"]]))[idx] + bucket = (flatten([for bucket in data.ibm_cos_bucket.existing_cos_instance_bucket : bucket[*].bucket_name]))[idx] + skey = (flatten([for secret_access_key in ibm_resource_key.existing_instance_bkt_new_hmac_keys : secret_access_key[*].credentials["cos_hmac_keys.secret_access_key"]]))[idx] + }] + + afm_config_details_3 = [for idx, config in var.exstng_instance_bucket_new_hmac : { + bucket = (flatten([for bucket in data.ibm_cos_bucket.existing_cos_instance_bucket : bucket[*].bucket_name]))[idx] + filesystem = local.filesystem + fileset = ([for fileset in var.exstng_instance_bucket_new_hmac : fileset.afm_fileset])[idx] + mode = ([for mode in var.exstng_instance_bucket_new_hmac : mode.mode])[idx] + endpoint = "https://${(flatten([for endpoint in data.ibm_cos_bucket.existing_cos_instance_bucket : endpoint[*].s3_endpoint_direct]))[idx]}" + }] +} + +############################################################################################################# +# 4. 
It uses existing COS instance and existing Hmac Key and creates new Bucket in that instance. +############################################################################################################# + +locals { + exstng_instance_hmac_new_bkt = [for instance in var.exstng_instance_hmac_new_bucket : instance.cos_instance] + exstng_instance_exstng_hmac = [for hmac in var.exstng_instance_hmac_new_bucket : hmac.cos_service_cred_key] + + # New bucket single Site + exstng_instance_hmac_new_bkt_single_site = [for instance in var.exstng_instance_hmac_new_bucket : instance.cos_instance if instance.bucket_type == "single_site_location"] + exstng_instance_hmac_single_site_region = [for region in var.exstng_instance_hmac_new_bucket : region.bucket_region if region.bucket_type == "single_site_location"] + exstng_instance_hmac_storage_class_single_site = [for class in var.exstng_instance_hmac_new_bucket : class.bucket_storage_class if class.bucket_type == "single_site_location"] + exstng_instance_hmac_mode_single_site = [for mode in var.exstng_instance_hmac_new_bucket : mode.mode if mode.bucket_type == "single_site_location"] + exstng_instance_hmac_fileset_single_site = [for fileset in var.exstng_instance_hmac_new_bucket : fileset.afm_fileset if fileset.bucket_type == "single_site_location"] + # New bucket regional + exstng_instance_hmac_new_bkt_regional = [for instance in var.exstng_instance_hmac_new_bucket : instance.cos_instance if instance.bucket_type == "region_location" || instance.bucket_type == ""] + exstng_instance_hmac_regional_region = [for region in var.exstng_instance_hmac_new_bucket : region.bucket_region if region.bucket_type == "region_location" || region.bucket_type == ""] + exstng_instance_hmac_storage_class_regional = [for class in var.exstng_instance_hmac_new_bucket : class.bucket_storage_class if class.bucket_type == "region_location" || class.bucket_type == ""] + exstng_instance_hmac_mode_regional = [for mode in var.exstng_instance_hmac_new_bucket : mode.mode if mode.bucket_type == "region_location" || mode.bucket_type == ""] + exstng_instance_hmac_fileset_regional = [for fileset in var.exstng_instance_hmac_new_bucket : fileset.afm_fileset if fileset.bucket_type == "region_location" || fileset.bucket_type == ""] + # New bucket cross region + exstng_instance_hmac_new_bkt_cross_region = [for instance in var.exstng_instance_hmac_new_bucket : instance.cos_instance if instance.bucket_type == "cross_region_location"] + exstng_instance_hmac_cross_region = [for region in var.exstng_instance_hmac_new_bucket : region.bucket_region if region.bucket_type == "cross_region_location"] + exstng_instance_hmac_storage_class_cross_regional = [for class in var.exstng_instance_hmac_new_bucket : class.bucket_storage_class if class.bucket_type == "cross_region_location"] + exstng_instance_hmac_mode_cross_regional = [for mode in var.exstng_instance_hmac_new_bucket : mode.mode if mode.bucket_type == "cross_region_location"] + exstng_instance_hmac_fileset_cross_regional = [for fileset in var.exstng_instance_hmac_new_bucket : fileset.afm_fileset if fileset.bucket_type == "cross_region_location"] +} + +data "ibm_resource_instance" "exstng_cos_instance_hmac_new_bucket_single_site" { + for_each = length(local.exstng_instance_hmac_new_bkt_single_site) == 0 ? 
{} : { + for idx, value in local.exstng_instance_hmac_new_bkt_single_site : idx => { + cos_instance = element(local.exstng_instance_hmac_new_bkt_single_site, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +resource "ibm_cos_bucket" "existing_cos_instance_hmac_new_cos_bucket_single_site" { + for_each = { + for idx, count_number in range(1, length(local.exstng_instance_hmac_single_site_region) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in data.ibm_resource_instance.exstng_cos_instance_hmac_new_bucket_single_site : instance_id[*].id]), idx) + region_location = element(local.exstng_instance_hmac_single_site_region, idx) + storage_class = element(local.exstng_instance_hmac_storage_class_single_site, idx) + } + } + bucket_name = format("%s-%03s", "${var.prefix}new-bucket", each.value.sequence_string) + resource_instance_id = each.value.cos_instance + single_site_location = each.value.region_location + storage_class = each.value.storage_class == "" ? "smart" : each.value.storage_class + depends_on = [data.ibm_resource_instance.exstng_cos_instance_hmac_new_bucket_single_site] +} + +data "ibm_resource_instance" "exstng_cos_instance_hmac_new_bucket_regional" { + for_each = length(local.exstng_instance_hmac_new_bkt_regional) == 0 ? {} : { + for idx, value in local.exstng_instance_hmac_new_bkt_regional : idx => { + cos_instance = element(local.exstng_instance_hmac_new_bkt_regional, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +resource "ibm_cos_bucket" "existing_cos_instance_hmac_new_cos_bucket_regional" { + for_each = { + for idx, count_number in range(1, length(local.exstng_instance_hmac_regional_region) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in data.ibm_resource_instance.exstng_cos_instance_hmac_new_bucket_regional : instance_id[*].id]), idx) + region_location = element(local.exstng_instance_hmac_regional_region, idx) + storage_class = element(local.exstng_instance_hmac_storage_class_regional, idx) + } + } + bucket_name = format("%s-%03s", "${var.prefix}new-bucket", (each.value.sequence_string + length(local.exstng_instance_hmac_single_site_region))) + resource_instance_id = each.value.cos_instance + region_location = each.value.region_location + storage_class = each.value.storage_class == "" ? "smart" : each.value.storage_class + depends_on = [data.ibm_resource_instance.exstng_cos_instance_hmac_new_bucket_regional] +} + +data "ibm_resource_instance" "exstng_cos_instance_hmac_new_bucket_cross_region" { + for_each = length(local.exstng_instance_hmac_new_bkt_cross_region) == 0 ? 
{} : { + for idx, value in local.exstng_instance_hmac_new_bkt_cross_region : idx => { + cos_instance = element(local.exstng_instance_hmac_new_bkt_cross_region, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +resource "ibm_cos_bucket" "existing_cos_instance_hmac_new_cos_bucket_cross_region" { + for_each = { + for idx, count_number in range(1, length(local.exstng_instance_hmac_cross_region) + 1) : idx => { + sequence_string = tostring(count_number) + cos_instance = element(flatten([for instance_id in data.ibm_resource_instance.exstng_cos_instance_hmac_new_bucket_cross_region : instance_id[*].id]), idx) + region_location = element(local.exstng_instance_hmac_cross_region, idx) + storage_class = element(local.exstng_instance_hmac_storage_class_cross_regional, idx) + } + } + bucket_name = format("%s-%03s", "${var.prefix}new-bucket", (each.value.sequence_string + (length(local.exstng_instance_hmac_single_site_region) + length(local.exstng_instance_hmac_regional_region)))) + resource_instance_id = each.value.cos_instance + cross_region_location = each.value.region_location + storage_class = each.value.storage_class == "" ? "smart" : each.value.storage_class + depends_on = [data.ibm_resource_instance.exstng_cos_instance_hmac_new_bucket_cross_region] +} + +data "ibm_resource_instance" "exstng_cos_instance_hmac_new_bucket" { + for_each = { + for idx, value in local.exstng_instance_hmac_new_bkt : idx => { + cos_instance = element(local.exstng_instance_hmac_new_bkt, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +data "ibm_resource_key" "existing_hmac_key" { + for_each = { + for idx, value in local.exstng_instance_exstng_hmac : idx => { + hmac_key = element(local.exstng_instance_exstng_hmac, idx) + resource_instance_id = element(flatten([for instance in data.ibm_resource_instance.exstng_cos_instance_hmac_new_bucket : instance[*].id]), idx) + } + } + name = each.value.hmac_key + resource_instance_id = each.value.resource_instance_id + depends_on = [data.ibm_resource_instance.exstng_cos_instance_hmac_new_bucket] +} + +locals { + exstng_instance_hmac_buckets = concat((flatten([for bucket in ibm_cos_bucket.existing_cos_instance_hmac_new_cos_bucket_single_site : bucket[*].bucket_name])), (flatten([for bucket in ibm_cos_bucket.existing_cos_instance_hmac_new_cos_bucket_regional : bucket[*].bucket_name])), (flatten([for bucket in ibm_cos_bucket.existing_cos_instance_hmac_new_cos_bucket_cross_region : bucket[*].bucket_name]))) + exstng_instance_hmac_endpoints = concat((flatten([for endpoint in ibm_cos_bucket.existing_cos_instance_hmac_new_cos_bucket_single_site : endpoint[*].s3_endpoint_direct])), (flatten([for endpoint in ibm_cos_bucket.existing_cos_instance_hmac_new_cos_bucket_regional : endpoint[*].s3_endpoint_direct])), (flatten([for endpoint in ibm_cos_bucket.existing_cos_instance_hmac_new_cos_bucket_cross_region : endpoint[*].s3_endpoint_direct]))) + exstng_instance_hmac_modes = concat(local.exstng_instance_hmac_mode_single_site, local.exstng_instance_hmac_mode_regional, local.exstng_instance_hmac_mode_cross_regional) + exstng_instance_hmac_filesets = concat(local.exstng_instance_hmac_fileset_single_site, local.exstng_instance_hmac_fileset_regional, local.exstng_instance_hmac_fileset_cross_regional) + + afm_cos_bucket_details_4 = [for idx, config in var.exstng_instance_hmac_new_bucket : { + akey = (flatten([for access_key in data.ibm_resource_key.existing_hmac_key : 
access_key[*].credentials["cos_hmac_keys.access_key_id"]]))[idx] + bucket = (local.exstng_instance_hmac_buckets)[idx] + skey = (flatten([for secret_access_key in data.ibm_resource_key.existing_hmac_key : secret_access_key[*].credentials["cos_hmac_keys.secret_access_key"]]))[idx] + }] + + afm_config_details_4 = [for idx, config in var.exstng_instance_hmac_new_bucket : { + bucket = (local.exstng_instance_hmac_buckets)[idx] + filesystem = local.filesystem + fileset = (local.exstng_instance_hmac_filesets)[idx] + mode = (local.exstng_instance_hmac_modes)[idx] + endpoint = "https://${(local.exstng_instance_hmac_endpoints)[idx]}" + }] +} + +############################################################################################################# +# 5. It uses existing COS instance, Bucket and Hmac Key +############################################################################################################# + +locals { + exstng_instance_bkt_hmac = [for instance in var.exstng_instance_bucket_hmac : instance.cos_instance] + exstng_instance_exstng_bkt = [for bucket in var.exstng_instance_bucket_hmac : bucket.bucket_name] + exstng_instance_hmac_bkt = [for hmac in var.exstng_instance_bucket_hmac : hmac.cos_service_cred_key] + region_exstng_instance_bucket_hmac = [for region in var.exstng_instance_bucket_hmac : region.bucket_region] + exstng_instance_bkt_type = [for type in var.exstng_instance_bucket_hmac : type.bucket_type] +} + + +data "ibm_resource_instance" "exstng_cos_instance_bucket_hmac" { + for_each = { + for idx, value in var.exstng_instance_bucket_hmac : idx => { + cos_instance = element(local.exstng_instance_bkt_hmac, idx) + } + } + name = each.value.cos_instance + service = var.cos_instance_service +} + +data "ibm_cos_bucket" "exstng_cos_instance_bucket" { + for_each = { + for idx, value in var.exstng_instance_bucket_hmac : idx => { + bucket_name = element(local.exstng_instance_exstng_bkt, idx) + resource_instance_id = element(flatten([for instance in data.ibm_resource_instance.exstng_cos_instance_bucket_hmac : instance[*].id]), idx) + bucket_region = element(local.region_exstng_instance_bucket_hmac, idx) + bucket_type = element(local.exstng_instance_bkt_type, idx) + } + } + bucket_name = each.value.bucket_name + resource_instance_id = each.value.resource_instance_id + bucket_region = each.value.bucket_region + bucket_type = each.value.bucket_type + depends_on = [data.ibm_resource_instance.exstng_cos_instance_bucket_hmac] +} + +data "ibm_resource_key" "exstng_cos_instance_hmac" { + for_each = { + for idx, value in var.exstng_instance_bucket_hmac : idx => { + hmac_key = element(local.exstng_instance_hmac_bkt, idx) + resource_instance_id = element(flatten([for instance in data.ibm_resource_instance.exstng_cos_instance_bucket_hmac : instance[*].id]), idx) + } + } + name = each.value.hmac_key + resource_instance_id = each.value.resource_instance_id + depends_on = [data.ibm_resource_instance.exstng_cos_instance_bucket_hmac] +} + +locals { + afm_cos_bucket_details_5 = [for idx, config in var.exstng_instance_bucket_hmac : { + akey = (flatten([for access_key in data.ibm_resource_key.exstng_cos_instance_hmac : access_key[*].credentials["cos_hmac_keys.access_key_id"]]))[idx] + bucket = (flatten([for bucket in data.ibm_cos_bucket.exstng_cos_instance_bucket : bucket[*].bucket_name]))[idx] + skey = (flatten([for secret_access_key in data.ibm_resource_key.exstng_cos_instance_hmac : secret_access_key[*].credentials["cos_hmac_keys.secret_access_key"]]))[idx] + }] + + afm_config_details_5 = [for idx, 
config in var.exstng_instance_bucket_hmac : { + bucket = (flatten([for bucket in data.ibm_cos_bucket.exstng_cos_instance_bucket : bucket[*].bucket_name]))[idx] + filesystem = local.filesystem + fileset = ([for fileset in var.exstng_instance_bucket_hmac : fileset.afm_fileset])[idx] + mode = ([for mode in var.exstng_instance_bucket_hmac : mode.mode])[idx] + endpoint = "https://${(flatten([for endpoint in data.ibm_cos_bucket.exstng_cos_instance_bucket : endpoint[*].s3_endpoint_direct]))[idx]}" + }] +} diff --git a/modules/cos/main.tf b/modules/cos/main.tf new file mode 100644 index 00000000..e69de29b diff --git a/modules/cos/outputs.tf b/modules/cos/outputs.tf new file mode 100644 index 00000000..44aa1d99 --- /dev/null +++ b/modules/cos/outputs.tf @@ -0,0 +1,9 @@ +output "afm_cos_bucket_details" { + value = concat(local.afm_cos_bucket_details_1, local.afm_cos_bucket_details_2, local.afm_cos_bucket_details_3, local.afm_cos_bucket_details_4, local.afm_cos_bucket_details_5) + description = "AFM cos bucket details" +} + +output "afm_config_details" { + value = concat(local.afm_config_details_1, local.afm_config_details_2, local.afm_config_details_3, local.afm_config_details_4, local.afm_config_details_5) + description = "AFM configuration details" +} diff --git a/modules/cos/variables.tf b/modules/cos/variables.tf new file mode 100644 index 00000000..5008046d --- /dev/null +++ b/modules/cos/variables.tf @@ -0,0 +1,109 @@ +variable "prefix" { + type = string + description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." +} + +variable "resource_group_id" { + type = string + description = "Resource group id." +} + +variable "cos_instance_plan" { + type = string + description = "COS instance plan." +} +variable "cos_instance_location" { + type = string + description = "COS instance location." +} + +variable "cos_instance_service" { + type = string + description = "COS instance service." +} + +variable "cos_hmac_role" { + type = string + description = "HMAC key role." 
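+  # The five list(object) variables defined below correspond to the five bucket/HMAC scenarios
+  # handled in main.tf (afm_cos_bucket_details_1 through _5). A minimal, hypothetical tfvars entry
+  # for the fully-existing case might look like the following (all values are illustrative only):
+  #   exstng_instance_bucket_hmac = [{
+  #     afm_fileset = "fileset1", mode = "iw", cos_instance = "my-cos-instance",
+  #     bucket_name = "my-bucket", bucket_region = "us-south",
+  #     cos_service_cred_key = "my-hmac-key", bucket_type = "region_location",
+  #     bucket_storage_class = "smart"
+  #   }]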
+} + +variable "new_instance_bucket_hmac" { + type = list( + object({ + afm_fileset = string, + mode = string, + cos_instance = string, + bucket_name = string, + bucket_region = string, + cos_service_cred_key = string, + bucket_type = string, + bucket_storage_class = string + }) + ) + description = "It creates a new COS instance, bucket, and HMAC key." +} +variable "exstng_instance_new_bucket_hmac" { + type = list( + object({ + afm_fileset = string, + mode = string, + cos_instance = string, + bucket_name = string, + bucket_region = string, + cos_service_cred_key = string, + bucket_type = string, + bucket_storage_class = string + }) + ) + description = "It uses an existing COS instance and creates a new bucket and HMAC key." +} +variable "exstng_instance_bucket_new_hmac" { + type = list( + object({ + afm_fileset = string, + mode = string, + cos_instance = string, + bucket_name = string, + bucket_region = string, + cos_service_cred_key = string, + bucket_type = string, + bucket_storage_class = string + }) + ) + description = "It uses an existing COS instance and bucket, and creates a new HMAC key." +} +variable "exstng_instance_hmac_new_bucket" { + type = list( + object({ + afm_fileset = string, + mode = string, + cos_instance = string, + bucket_name = string, + bucket_region = string, + cos_service_cred_key = string, + bucket_type = string, + bucket_storage_class = string + }) + ) + description = "It uses an existing COS instance and HMAC key, and creates a new bucket." +} +variable "exstng_instance_bucket_hmac" { + type = list( + object({ + afm_fileset = string, + mode = string, + cos_instance = string, + bucket_name = string, + bucket_region = string, + cos_service_cred_key = string, + bucket_type = string, + bucket_storage_class = string + }) + ) + description = "It uses an existing COS instance, bucket, and HMAC key." +} + +variable "filesystem" { + type = string + description = "Storage filesystem name." +} diff --git a/modules/cos/version.tf b/modules/cos/version.tf new file mode 100644 index 00000000..913bf325 --- /dev/null +++ b/modules/cos/version.tf @@ -0,0 +1,22 @@ +############################################################################## +# Terraform Providers +############################################################################## + +terraform { + required_version = ">= 1.3" + # Use "greater than or equal to" range for root level modules + required_providers { + ibm = { + source = "IBM-Cloud/ibm" + version = ">= 1.68.1, < 2.0.0" + } + # local = { + # source = "hashicorp/local" + # version = "~> 2" + # } + # ansible = { + # source = "ansible/ansible" + # version = "~> 1.3.0" + # } + } +} diff --git a/modules/database/mysql/main.tf b/modules/database/mysql/main.tf deleted file mode 100644 index 8d8cb584..00000000 --- a/modules/database/mysql/main.tf +++ /dev/null @@ -1,25 +0,0 @@ -################################################################################ -# database/mysql/main.tf - Creating a MySQL database -################################################################################ -# Copyright 2023 IBM -# -# Licensed under the MIT License. See the LICENSE file for details.
-# -# Maintainer: Salvatore D'Angelo -################################################################################ - -module "db" { - source = "terraform-ibm-modules/icd-mysql/ibm" - version = "1.3.9" - resource_group_id = var.resource_group_id - name = var.name - region = var.region - service_endpoints = var.service_endpoints - mysql_version = var.mysql_version - admin_pass = var.admin_password - members = var.members - member_memory_mb = var.memory - member_disk_mb = var.disks - member_cpu_count = var.vcpu - member_host_flavor = var.host_flavour -} diff --git a/modules/database/mysql/outputs.tf b/modules/database/mysql/outputs.tf deleted file mode 100644 index 3c409ef0..00000000 --- a/modules/database/mysql/outputs.tf +++ /dev/null @@ -1,16 +0,0 @@ -output "db_instance_info" { - description = "Database instance information" - value = { - id = module.db.id - admin_user = module.db.adminuser - hostname = module.db.hostname - port = module.db.port - certificate = module.db.certificate_base64 - } -} - -output "db_admin_password" { - description = "Database instance password" - value = var.admin_password - sensitive = true -} diff --git a/modules/database/mysql/variables.tf b/modules/database/mysql/variables.tf deleted file mode 100644 index 949c0dc4..00000000 --- a/modules/database/mysql/variables.tf +++ /dev/null @@ -1,63 +0,0 @@ -variable "name" { - description = "Name of the Database" - type = string -} - -variable "mysql_version" { - description = "MySQL version of the Database" - type = string - default = "8.0" -} - -variable "region" { - description = "The region where the database must be instantiated" - type = string -} - -variable "admin_password" { - description = "The administrator password" - sensitive = true - type = string -} - -variable "resource_group_id" { - description = "Resource group ID" - type = string - default = null -} - -variable "members" { - description = "Number of members" - type = number - default = null -} - -variable "memory" { - description = "Ram in megabyte" - type = number - default = null -} - -variable "disks" { - description = "Rom in megabyte" - type = number - default = null -} - -variable "vcpu" { - description = "Number of cpu cores" - type = number - default = null -} - -variable "host_flavour" { - description = "Allocated host flavor per member." - type = string - default = null -} - -variable "service_endpoints" { - description = "The service endpoints" - type = string - default = "private" -} diff --git a/modules/database/mysql/version.tf b/modules/database/mysql/version.tf deleted file mode 100644 index bfb98ab9..00000000 --- a/modules/database/mysql/version.tf +++ /dev/null @@ -1,3 +0,0 @@ -terraform { - required_version = ">= 1.9.0" -} diff --git a/modules/deployer/datasource.tf b/modules/deployer/datasource.tf new file mode 100644 index 00000000..fc391e8a --- /dev/null +++ b/modules/deployer/datasource.tf @@ -0,0 +1,28 @@ +# data "ibm_resource_group" "existing_resource_group" { +# name = var.existing_resource_group +# } + +data "ibm_is_image" "bastion" { + name = var.bastion_instance["image"] +} + +data "ibm_is_image" "deployer" { + count = local.deployer_image_found_in_map ? 0 : 1 + name = var.deployer_instance["image"] +} + +data "ibm_is_ssh_key" "bastion" { + for_each = toset(var.ssh_keys) + name = each.key +} + +# Existing Bastion details +data "ibm_is_instance" "bastion_instance_name" { + count = var.bastion_instance_name != null ? 
1 : 0 + name = var.bastion_instance_name +} + +#Existing Public Gateway attachment +data "ibm_is_public_gateways" "public_gateways" { + count = var.ext_vpc_name != null ? 1 : 0 +} diff --git a/modules/deployer/image_map.tf b/modules/deployer/image_map.tf new file mode 100644 index 00000000..292e2b49 --- /dev/null +++ b/modules/deployer/image_map.tf @@ -0,0 +1,30 @@ +locals { + image_region_map = { + "hpc-lsf-fp15-deployer-rhel810-v1" = { + "eu-es" = "r050-e7b874c1-f370-41c4-8ee6-50efb07aa340" + "eu-gb" = "r018-eb14c522-cb0f-4b72-948f-2c029957665a" + "eu-de" = "r010-00629ef3-324c-4651-a7a7-76830d2ad660" + "us-east" = "r014-ac586488-de00-490e-8962-5e2a7fcab076" + "us-south" = "r006-f2b7871c-54c9-4b02-837c-1d28294f0842" + "jp-tok" = "r022-dd715ea3-d2dc-4936-bff0-51c9cd63b3a9" + "jp-osa" = "r034-82d648ed-fd3e-4248-955c-6009c973aa5f" + "au-syd" = "r026-b47e4863-f5e7-440c-8734-c058f6b8ce33" + "br-sao" = "r042-8b5ac031-3e65-4afb-9679-b7e2b907a2ad" + "ca-tor" = "r038-c55b1ab4-500f-4842-9e78-dc64a16a746a" + "ca-mon" = "r058-fc93c3f9-f97c-4d9b-b8d6-dd40db891913" + }, + "hpc-lsf-fp14-deployer-rhel810-v1" = { + "eu-es" = "r050-a530edc3-d053-41cd-899b-2c61d53d5efd" + "eu-gb" = "r018-b368f002-64ea-48bb-a5f1-77e7891c2691" + "eu-de" = "r010-c5b5f7d9-bc3e-4e18-9724-f682ccfef617" + "us-east" = "r014-9d0c683d-da23-4836-9057-d8732c26010a" + "us-south" = "r006-33e861c5-590f-492d-a97b-eb62e313dc8d" + "jp-tok" = "r022-b02c8618-ea8f-42bf-854a-da5822ee3cb5" + "jp-osa" = "r034-490ee8d9-f5af-410d-9aeb-c6190beefdf6" + "au-syd" = "r026-93a5c85d-8861-46a9-8100-1d3d788f750d" + "br-sao" = "r042-93c1a769-c138-4765-91d2-5796965b6a98" + "ca-tor" = "r038-9448213f-22ce-4a6a-b6b0-22dd6ed9fbb3" + "ca-mon" = "r058-b3211406-9eec-4148-aafb-d6ab7c26a6eb" + } + } +} diff --git a/modules/deployer/locals.tf b/modules/deployer/locals.tf new file mode 100644 index 00000000..140240bc --- /dev/null +++ b/modules/deployer/locals.tf @@ -0,0 +1,90 @@ +# define variables +locals { + name = var.scheduler == "LSF" ? "LSF" : (var.scheduler == "Scale" ? "Scale" : (var.scheduler == "HPCaaS" ? "HPCaaS" : (var.scheduler == "Symphony" ? "Symphony" : (var.scheduler == "Slurm" ? "Slurm" : "")))) + prefix = var.prefix + tags = [local.prefix, local.name] + region = join("-", slice(split("-", var.zones[0]), 0, 2)) + + schematics_reserved_cidrs = [ + "169.44.0.0/14", + "169.60.0.0/14", + "158.175.0.0/16", + "158.176.0.0/15", + "141.125.0.0/16", + "161.156.0.0/16", + "149.81.0.0/16", + "159.122.111.224/27", + "150.238.230.128/27", + "169.55.82.128/27" + ] + bastion_sg_variable_cidr = distinct(flatten([ + local.schematics_reserved_cidrs, + var.allowed_cidr, + var.cluster_cidr + ])) + + enable_deployer = var.enable_deployer + + bastion_node_name = format("%s-%s", local.prefix, "bastion") + deployer_node_name = format("%s-%s", local.prefix, "deployer") + + bastion_image_id = data.ibm_is_image.bastion.id + + # deployer_image_id = data.ibm_is_image.deployer[0].id + # Check whether an entry is found in the mapping file for the given deployer node image + deployer_image_found_in_map = contains(keys(local.image_region_map), var.deployer_instance["image"]) + # If not found, assume the name is the id already (customer provided image) + new_deployer_image_id = local.deployer_image_found_in_map ? 
local.image_region_map[var.deployer_instance["image"]][local.region] : "Image not found with the given name" + + bastion_ssh_keys = [for name in var.ssh_keys : data.ibm_is_ssh_key.bastion[name].id] + + # Scale static configs + # scale_cloud_deployer_path = "/opt/IBM/ibm-spectrumscale-cloud-deploy" + # scale_cloud_infra_repo_url = "https://github.com/IBM/ibm-spectrum-scale-install-infra" + # scale_cloud_infra_repo_name = "ibm-spectrum-scale-install-infra" + # scale_cloud_infra_repo_tag = "ibmcloud_v2.6.0" + + # LSF static configs + # lsf_cloud_deployer_path = "/opt/ibm/lsf" + + # Security group rules + # TODO: Fix SG rules + bastion_security_group_rules = flatten([ + [for cidr in local.bastion_sg_variable_cidr : { + name = format("allow-variable-inbound-%s", index(local.bastion_sg_variable_cidr, cidr) + 1) + direction = "inbound" + remote = cidr + }], + + # Conditional SG ID inbound rule (added only if condition is met) + var.existing_bastion_security_group_id != null ? [{ + name = "allow-sg-id-inbound" + direction = "inbound" + remote = var.existing_bastion_security_group_id # The source security group ID + }] : [], + + [for cidr in concat(local.bastion_sg_variable_cidr, ["0.0.0.0/0"]) : { + name = format("allow-variable-outbound-%s", index(concat(local.bastion_sg_variable_cidr, ["0.0.0.0/0"]), cidr) + 1) + direction = "outbound" + remote = cidr + }] + ]) + + # Derived configs + # VPC + # resource_group_id = data.ibm_resource_group.existing_resource_group.id + + # Subnets + bastion_subnets = var.bastion_subnets +} + +locals { + vsi_interfaces = ["eth0", "eth1"] + compute_interfaces = local.vsi_interfaces[0] + compute_dns_domain = var.dns_domain_names["compute"] +} + +locals { + public_gateways_list = var.ext_vpc_name != null ? data.ibm_is_public_gateways.public_gateways[0].public_gateways : [] + zone_1_pgw_ids = var.ext_vpc_name != null ? [for gateway in local.public_gateways_list : gateway.id if gateway.vpc == var.vpc_id && gateway.zone == var.zones[0]] : [] +} diff --git a/modules/deployer/main.tf b/modules/deployer/main.tf new file mode 100644 index 00000000..adf15701 --- /dev/null +++ b/modules/deployer/main.tf @@ -0,0 +1,76 @@ +resource "ibm_is_subnet_public_gateway_attachment" "zone_1_attachment" { + count = (var.ext_vpc_name != null && var.ext_cluster_subnet_id == null) ? 1 : 0 + subnet = var.cluster_subnets[0].id + public_gateway = length(local.zone_1_pgw_ids) > 0 ? local.zone_1_pgw_ids[0] : "" +} + +resource "ibm_is_subnet_public_gateway_attachment" "bastion_attachment" { + count = (var.ext_vpc_name != null && var.ext_login_subnet_id == null) ? 1 : 0 + subnet = local.bastion_subnets[0].id + public_gateway = length(local.zone_1_pgw_ids) > 0 ? local.zone_1_pgw_ids[0] : "" +} + +module "ssh_key" { + count = var.enable_deployer ? 1 : 0 + source = "./../key" + private_key_path = "bastion_id_rsa" #checkov:skip=CKV_SECRET_6 +} + +module "bastion_sg" { + count = var.enable_deployer ? 1 : 0 + source = "terraform-ibm-modules/security-group/ibm" + version = "2.6.2" + add_ibm_cloud_internal_rules = true + resource_group = var.resource_group + security_group_name = format("%s-bastion-sg", local.prefix) + security_group_rules = local.bastion_security_group_rules + vpc_id = var.vpc_id +} + +module "bastion_vsi" { + count = (var.enable_deployer && var.bastion_instance_name == null) ? 
1 : 0 + source = "terraform-ibm-modules/landing-zone-vsi/ibm" + version = "5.0.0" + vsi_per_subnet = 1 + create_security_group = false + security_group = null + image_id = local.bastion_image_id + machine_type = var.bastion_instance["profile"] + prefix = local.bastion_node_name + resource_group_id = var.resource_group + enable_floating_ip = true + security_group_ids = module.bastion_sg[*].security_group_id + ssh_key_ids = local.bastion_ssh_keys + subnets = local.bastion_subnets + tags = local.tags + user_data = data.template_file.bastion_user_data.rendered + vpc_id = var.vpc_id + kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = true + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid +} + +module "deployer_vsi" { + count = local.enable_deployer ? 1 : 0 + source = "terraform-ibm-modules/landing-zone-vsi/ibm" + version = "5.0.0" + vsi_per_subnet = 1 + create_security_group = false + security_group = null + image_id = local.deployer_image_found_in_map ? local.new_deployer_image_id : data.ibm_is_image.deployer[0].id + machine_type = var.deployer_instance["profile"] + prefix = local.deployer_node_name + resource_group_id = var.resource_group + enable_floating_ip = false + security_group_ids = module.bastion_sg[*].security_group_id + ssh_key_ids = local.bastion_ssh_keys + subnets = local.bastion_subnets + tags = local.tags + user_data = data.template_file.deployer_user_data.rendered + vpc_id = var.vpc_id + kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = var.skip_iam_authorization_policy + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid +} diff --git a/modules/bootstrap/outputs.tf b/modules/deployer/outputs.tf similarity index 61% rename from modules/bootstrap/outputs.tf rename to modules/deployer/outputs.tf index dd9c746d..f7ed216b 100644 --- a/modules/bootstrap/outputs.tf +++ b/modules/deployer/outputs.tf @@ -1,3 +1,13 @@ +output "bastion_vsi_data" { + description = "Bastion VSI data" + value = module.bastion_vsi[*] +} + +output "deployer_vsi_data" { + description = "Deployer VSI data" + value = module.deployer_vsi[*] +} + output "bastion_primary_ip" { description = "Bastion primary IP" value = var.bastion_instance_name != null && var.bastion_instance_public_ip != null ? data.ibm_is_instance.bastion_instance_name[0].primary_network_interface[0].primary_ip[0].address : one(module.bastion_vsi[*]["fip_list"][0]["ipv4_address"]) @@ -5,17 +15,17 @@ output "bastion_primary_ip" { output "bastion_fip" { description = "Bastion FIP" - value = var.bastion_instance_public_ip != null && var.bastion_instance_name != null ? [var.bastion_instance_public_ip] : module.bastion_vsi[*]["fip_list"][0]["floating_ip"] + value = var.bastion_instance_public_ip != null && var.bastion_instance_name != null ? var.bastion_instance_public_ip : one(module.bastion_vsi[*]["fip_list"][0]["floating_ip"]) } output "bastion_fip_id" { description = "Bastion FIP ID" - value = var.bastion_instance_name != null && var.bastion_instance_public_ip != null ? null : one(module.bastion_vsi[*]["fip_list"][0]["floating_ip_id"]) + value = var.bastion_instance_name != null && var.bastion_instance_public_ip != null ? null : module.bastion_vsi[*]["fip_list"][0]["floating_ip_id"] } output "bastion_security_group_id" { description = "Bastion SG" - value = var.bastion_security_group_id != null ? 
var.bastion_security_group_id : one(module.bastion_sg[*].security_group_id) + value = one(module.bastion_sg[*].security_group_id) } output "bastion_public_key_content" { @@ -24,6 +34,11 @@ output "bastion_public_key_content" { value = one(module.ssh_key[*].public_key_content) } +output "deployer_ip" { + description = "Deployer IP" + value = one(module.deployer_vsi[*]["list"][0]["ipv4_address"]) +} + output "bastion_private_key_content" { description = "Bastion private key content" sensitive = true diff --git a/modules/deployer/template_files.tf b/modules/deployer/template_files.tf new file mode 100644 index 00000000..8939cee2 --- /dev/null +++ b/modules/deployer/template_files.tf @@ -0,0 +1,15 @@ +data "template_file" "bastion_user_data" { + template = file("${path.module}/templates/bastion_user_data.tpl") + vars = { + ssh_public_key_content = var.enable_deployer ? module.ssh_key[0].public_key_content : "" + } +} + +data "template_file" "deployer_user_data" { + template = file("${path.module}/templates/deployer_user_data.tpl") + vars = { + bastion_public_key_content = var.enable_deployer ? module.ssh_key[0].public_key_content : "" + compute_dns_domain = var.enable_deployer ? local.compute_dns_domain : "" + compute_interfaces = var.enable_deployer ? local.compute_interfaces : "" + } +} diff --git a/modules/bootstrap/templates/bastion_user_data.tpl b/modules/deployer/templates/bastion_user_data.tpl similarity index 100% rename from modules/bootstrap/templates/bastion_user_data.tpl rename to modules/deployer/templates/bastion_user_data.tpl diff --git a/modules/deployer/templates/deployer_user_data.tpl b/modules/deployer/templates/deployer_user_data.tpl new file mode 100644 index 00000000..ebaacb88 --- /dev/null +++ b/modules/deployer/templates/deployer_user_data.tpl @@ -0,0 +1,51 @@ +#!/usr/bin/bash + +################################################### +# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
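+# Note on the steps below: this user data script restricts direct root SSH logins to an
+# informational message, appends the compute DNS domain and an MTU of 9000 to the primary
+# interface config, relaxes password aging for the vpcuser account, restarts NetworkManager,
+# and then authorizes the bastion public key for SSH access to this node.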
+# Licensed under the Apache License v2.0 +################################################### + +#!/usr/bin/env bash +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys +echo "DOMAIN=${compute_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${compute_interfaces}" +echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${compute_interfaces}" +chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +sleep 20 +systemctl restart NetworkManager + +# input parameters +echo "${bastion_public_key_content}" >> /home/$USER/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> /home/$USER/.ssh/config +echo "StrictHostKeyChecking no" >> ~/.ssh/config + +# # setup env +# # TODO: Conditional installation (python3, terraform & ansible) +# if grep -E -q "CentOS|Red Hat" /etc/os-release +# then +# # TODO: Terraform Repo access +# #yum install -y yum-utils +# #yum-config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo +# #if (which terraform); then echo "Terraform exists, skipping the installation"; else (yum install -y terraform +# if (which python3); then echo "Python3 exists, skipping the installation"; else (yum install -y python38); fi +# if (which ansible-playbook); then echo "Ansible exists, skipping the installation"; else (yum install -y ansible); fi +# elif grep -q "Ubuntu" /etc/os-release +# then +# apt update +# # TODO: Terraform Repo access +# #apt-get update && sudo apt-get install -y gnupg software-properties-common +# #wget -O- https://apt.releases.hashicorp.com/gpg | gpg --dearmor | tee /usr/share/keyrings/hashicorp-archive-keyring.gpg +# #gpg --no-default-keyring --keyring /usr/share/keyrings/hashicorp-archive-keyring.gpg --fingerprint +# apt install software-properties-common +# apt-add-repository --yes --update ppa:ansible/ansible +# if (which python3); then echo "Python3 exists, skipping the installation"; else (apt install python38); fi +# if (which ansible-playbook); then echo "Ansible exists, skipping the installation"; else (apt install ansible); fi +# fi + +# TODO: run terraform diff --git a/modules/bootstrap/variables.tf b/modules/deployer/variables.tf similarity index 54% rename from modules/bootstrap/variables.tf rename to modules/deployer/variables.tf index d40e75cc..e1c9fb2e 100644 --- a/modules/bootstrap/variables.tf +++ b/modules/deployer/variables.tf @@ -31,16 +31,66 @@ variable "vpc_id" { description = "ID of an existing VPC in which the cluster resources will be deployed." } -variable "network_cidr" { - description = "Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning." - type = list(string) +variable "ext_vpc_name" { + type = string + default = null + description = "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" +} + +variable "cluster_cidr" { + description = "Network CIDR of the VPC. This is used to manage network security rules for cluster provisioning." 
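+  # This CIDR is combined with allowed_cidr and the Schematics reserved CIDRs
+  # (see bastion_sg_variable_cidr in locals.tf) to build the bastion security group rules.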
+ type = string + default = "10.241.0.0/18" +} + +variable "cluster_subnets" { + type = list(object({ + name = string + id = string + zone = string + cidr = string + })) + default = [] + description = "Name of an existing subnets in which the cluster resources will be deployed. If no value is given, then new subnet(s) will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" +} + +variable "ext_login_subnet_id" { + type = string + default = null + description = "Name of an existing subnets in which the bastion and cluster resources will be deployed. If no value is given, then new subnet(s) will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" +} + +variable "ext_cluster_subnet_id" { + type = string + default = null + description = "Name of an existing subnets in which the bastion and cluster resources will be deployed. If no value is given, then new subnet(s) will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" +} + +############################################################################## +# Offering Variations +############################################################################## +variable "scheduler" { + type = string default = null + description = "Select one of the scheduler (LSF/Symphony/Slurm/null)" } ############################################################################## # Access Variables ############################################################################## +variable "bastion_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "ibm-ubuntu-22-04-5-minimal-amd64-3" + profile = "cx2-4x8" + } + description = "Configuration for the Bastion node, including the image and instance profile. Only Ubuntu stock images are supported." +} + variable "bastion_subnets" { type = list(object({ name = string @@ -52,6 +102,27 @@ variable "bastion_subnets" { description = "Subnets to launch the bastion host." } +############################################################################## +# Deployer Variables +############################################################################## +variable "enable_deployer" { + type = bool + default = false + description = "Deployer should be only used for better deployment performance." +} + +variable "deployer_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "hpc-lsf-fp15-deployer-rhel810-v1" + profile = "bx2-8x32" + } + description = "Configuration for the deployer node, including the custom image and instance profile. By default, uses fixpack_15 image and a bx2-8x32 profile." +} + variable "ssh_keys" { type = list(string) description = "The key pair to use to access the host." @@ -60,7 +131,7 @@ variable "ssh_keys" { variable "allowed_cidr" { description = "Network CIDR to access the VPC. This is used to manage network ACL rules for accessing the cluster." type = list(string) - default = ["10.0.0.0/8"] + default = [] } # TODO: landing-zone-vsi limitation to opt out encryption @@ -84,10 +155,19 @@ variable "existing_kms_instance_guid" { variable "skip_iam_authorization_policy" { type = bool - default = false + default = true description = "Set to false if authorization policy is required for VPC block storage volumes to access kms. This can be set to true if authorization policy already exists. 
For more information on how to create authorization policy manually, see [creating authorization policies for block storage volume](https://cloud.ibm.com/docs/vpc?topic=vpc-block-s2s-auth&interface=ui)." } +variable "dns_domain_names" { + type = object({ + compute = string + storage = string + protocol = string + }) + description = "IBM Cloud HPC DNS domain names." +} + ########################################################################### # Existing Bastion Support variables ########################################################################### @@ -104,18 +184,13 @@ variable "bastion_instance_public_ip" { description = "Bastion instance public ip address." } -variable "bastion_security_group_id" { +variable "existing_bastion_security_group_id" { type = string default = null - description = "Bastion security group id." + description = "Existing bastion security group id." } -########################################################################### -# LDAP Server variables -########################################################################### - -variable "ldap_server" { - type = string - default = "null" - description = "Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created." +variable "zones" { + description = "Region where VPC will be created. To find your VPC region, use `ibmcloud is regions` command to find available regions." + type = list(string) } diff --git a/modules/bootstrap/version.tf b/modules/deployer/version.tf similarity index 84% rename from modules/bootstrap/version.tf rename to modules/deployer/version.tf index 6127d54e..e44fa0a2 100644 --- a/modules/bootstrap/version.tf +++ b/modules/deployer/version.tf @@ -3,7 +3,7 @@ terraform { required_providers { ibm = { source = "IBM-Cloud/ibm" - version = ">= 1.56.2" + version = ">= 1.68.1, < 2.0.0" } template = { source = "hashicorp/template" diff --git a/modules/dns/datasource.tf b/modules/dns/datasource.tf new file mode 100644 index 00000000..7261158b --- /dev/null +++ b/modules/dns/datasource.tf @@ -0,0 +1,9 @@ +# data "ibm_dns_zones" "conditional" { +# count = var.dns_instance_id != null ? 1 : 0 +# instance_id = var.dns_instance_id +# } + +data "ibm_dns_zones" "dns_zones" { + instance_id = local.dns_instance_id + depends_on = [ibm_dns_zone.dns_zone] +} diff --git a/modules/dns/locals.tf b/modules/dns/locals.tf new file mode 100644 index 00000000..110369b9 --- /dev/null +++ b/modules/dns/locals.tf @@ -0,0 +1,9 @@ +locals { + # dns_domain_names = flatten([setsubtract(var.dns_domain_names == null ? [] : var.dns_domain_names, flatten(data.ibm_dns_zones.conditional[*].dns_zones[*]["name"]))]) + + dns_zone_maps = [for zone in data.ibm_dns_zones.dns_zones.dns_zones : { + (zone["name"]) = zone["zone_id"] + } if contains(var.dns_domain_names, zone["name"])] + + dns_instance_id = var.dns_instance_id == null ? ibm_resource_instance.resource_instance[0].guid : var.dns_instance_id +} diff --git a/modules/dns/main.tf b/modules/dns/main.tf index 916d0077..603cd9ae 100644 --- a/modules/dns/main.tf +++ b/modules/dns/main.tf @@ -1,31 +1,20 @@ -resource "ibm_resource_instance" "itself" { +resource "ibm_resource_instance" "resource_instance" { count = var.dns_instance_id == null ? 1 : 0 name = format("%s-dns-instance", var.prefix) resource_group_id = var.resource_group_id location = "global" service = "dns-svcs" plan = "standard-dns" - tags = local.tags } -locals { - dns_instance_id = var.dns_instance_id == null ? 
ibm_resource_instance.itself[0].guid : var.dns_instance_id -} - -locals { - name = "lsf" - prefix = var.prefix - tags = [local.prefix, local.name] -} - -resource "ibm_dns_custom_resolver" "itself" { +resource "ibm_dns_custom_resolver" "dns_custom_resolver" { count = var.dns_custom_resolver_id == null ? 1 : 0 name = format("%s-custom-resolver", var.prefix) instance_id = local.dns_instance_id enabled = true high_availability = length(var.subnets_crn) > 1 ? true : false dynamic "locations" { - for_each = length(var.subnets_crn) > 3 ? slice(var.subnets_crn, 0, 2) : var.subnets_crn + for_each = length(var.subnets_crn) > 3 ? slice(var.subnets_crn, 0, 3) : var.subnets_crn content { subnet_crn = locations.value enabled = true @@ -33,27 +22,16 @@ resource "ibm_dns_custom_resolver" "itself" { } } -resource "ibm_dns_zone" "itself" { - count = 1 +resource "ibm_dns_zone" "dns_zone" { + count = length(var.dns_domain_names) instance_id = local.dns_instance_id - name = var.dns_domain_names[0] -} - -data "ibm_dns_zones" "itself" { - instance_id = local.dns_instance_id - depends_on = [ibm_dns_zone.itself] -} - -locals { - dns_zone_maps = [for zone in data.ibm_dns_zones.itself.dns_zones : { - (zone["name"]) = zone["zone_id"] - } if contains(var.dns_domain_names, zone["name"])] + name = var.dns_domain_names[count.index] } -resource "ibm_dns_permitted_network" "itself" { - count = 1 +resource "ibm_dns_permitted_network" "dns_permitted_network" { + count = length(var.dns_domain_names) instance_id = local.dns_instance_id vpc_crn = var.vpc_crn - zone_id = split("/", ibm_dns_zone.itself[0].id)[1] + zone_id = one(values(local.dns_zone_maps[count.index])) type = "vpc" } diff --git a/modules/dns/outputs.tf b/modules/dns/outputs.tf index 23937e1e..a137bb52 100644 --- a/modules/dns/outputs.tf +++ b/modules/dns/outputs.tf @@ -3,6 +3,11 @@ output "dns_instance_id" { value = local.dns_instance_id } +output "dns_custom_resolver_id" { + description = "DNS custom resolver ID" + value = var.dns_custom_resolver_id == null ? 
one(ibm_dns_custom_resolver.dns_custom_resolver[*].id) : var.dns_custom_resolver_id +} + output "dns_zone_maps" { description = "DNS zones" value = local.dns_zone_maps diff --git a/modules/dns/version.tf b/modules/dns/version.tf index 124b3869..3ce71d7a 100644 --- a/modules/dns/version.tf +++ b/modules/dns/version.tf @@ -3,7 +3,7 @@ terraform { required_providers { ibm = { source = "IBM-Cloud/ibm" - version = ">= 1.56.2" + version = ">= 1.68.1, < 2.0.0" } } } diff --git a/modules/dns_record/datasource.tf b/modules/dns_record/datasource.tf new file mode 100644 index 00000000..748dbfb8 --- /dev/null +++ b/modules/dns_record/datasource.tf @@ -0,0 +1,3 @@ +data "ibm_dns_zones" "dns_zones" { + instance_id = var.dns_instance_id +} diff --git a/modules/dns_record/locals.tf b/modules/dns_record/locals.tf new file mode 100644 index 00000000..ac7ff13c --- /dev/null +++ b/modules/dns_record/locals.tf @@ -0,0 +1,5 @@ +locals { + dns_domain_name = [ + for zone in data.ibm_dns_zones.dns_zones.dns_zones : zone["name"] if zone["zone_id"] == var.dns_zone_id + ] +} diff --git a/modules/dns_record/main.tf b/modules/dns_record/main.tf index a5755e7b..8c9c89e2 100644 --- a/modules/dns_record/main.tf +++ b/modules/dns_record/main.tf @@ -7,9 +7,6 @@ resource "ibm_dns_resource_record" "a" { rdata = var.dns_records[count.index]["rdata"] ttl = 300 } -########################### -# TODO: on line number30 update the var.dns_domain_names to pick up existing domain name when we support scale/protocol domain names -########################## resource "ibm_dns_resource_record" "ptr" { count = length(var.dns_records) @@ -17,8 +14,7 @@ resource "ibm_dns_resource_record" "ptr" { zone_id = var.dns_zone_id type = "PTR" name = var.dns_records[count.index]["rdata"] - rdata = format("%s.%s", var.dns_records[count.index]["name"], var.dns_domain_names["compute"]) - #rdata = format("%s.%s", var.dns_records[count.index]["name"], one(local.dns_domain_name)) - ttl = 300 - depends_on = [ibm_dns_resource_record.a] + rdata = format("%s.%s", var.dns_records[count.index]["name"], one(local.dns_domain_name)) + ttl = 300 + depends_on = [ibm_dns_resource_record.a] } diff --git a/modules/dns_record/variables.tf b/modules/dns_record/variables.tf index 473e760c..4d287154 100644 --- a/modules/dns_record/variables.tf +++ b/modules/dns_record/variables.tf @@ -22,15 +22,3 @@ variable "dns_records" { default = null description = "IBM Cloud HPC DNS record." } - -variable "dns_domain_names" { - type = object({ - compute = string - #storage = string - #protocol = string - }) - default = { - compute = "comp.com" - } - description = "IBM Cloud HPC DNS domain names." -} diff --git a/modules/dns_record/version.tf b/modules/dns_record/version.tf index 124b3869..3ce71d7a 100644 --- a/modules/dns_record/version.tf +++ b/modules/dns_record/version.tf @@ -3,7 +3,7 @@ terraform { required_providers { ibm = { source = "IBM-Cloud/ibm" - version = ">= 1.56.2" + version = ">= 1.68.1, < 2.0.0" } } } diff --git a/modules/file_storage/locals.tf b/modules/file_storage/locals.tf deleted file mode 100644 index 56b4e84e..00000000 --- a/modules/file_storage/locals.tf +++ /dev/null @@ -1,5 +0,0 @@ -locals { - name = "lsf" - prefix = var.prefix - tags = [local.prefix, local.name] -} diff --git a/modules/file_storage/main.tf b/modules/file_storage/main.tf index a6340a07..b91a0c07 100644 --- a/modules/file_storage/main.tf +++ b/modules/file_storage/main.tf @@ -1,15 +1,13 @@ resource "ibm_is_share" "share" { count = var.file_shares != null ? 
length(var.file_shares) : 0 name = format("%s-fs", var.file_shares[count.index]["name"]) + resource_group = var.resource_group_id access_control_mode = var.security_group_ids != null ? "security_group" : "vpc" size = var.file_shares[count.index]["size"] profile = "dp2" iops = var.file_shares[count.index]["iops"] zone = var.zone encryption_key = var.encryption_key_crn - resource_group = var.resource_group - tags = local.tags - depends_on = [time_sleep.wait_for_authorization_policy] } resource "ibm_iam_authorization_policy" "policy" { @@ -22,8 +20,7 @@ resource "ibm_iam_authorization_policy" "policy" { } resource "time_sleep" "wait_for_authorization_policy" { - depends_on = [ibm_iam_authorization_policy.policy[0]] - + depends_on = [ibm_iam_authorization_policy.policy[0]] create_duration = "30s" } @@ -46,5 +43,6 @@ resource "ibm_is_share_mount_target" "share_target_sg" { name = format("%s-fs-vni", var.file_shares[count.index]["name"]) security_groups = var.security_group_ids } - #transit_encryption = "user_managed" + # TODO: update transit_encryption value conditionaly; it fails with + # transit_encryption = "user_managed" } diff --git a/modules/file_storage/outputs.tf b/modules/file_storage/outputs.tf index 08ec4741..a66961ae 100644 --- a/modules/file_storage/outputs.tf +++ b/modules/file_storage/outputs.tf @@ -6,6 +6,17 @@ output "mount_path" { ]) } +output "name_mount_path_map" { + description = "Mount path name and its path map" + value = { + for mount_details in flatten([ + ibm_is_share_mount_target.share_target_vpc, + ibm_is_share_mount_target.share_target_sg + ]) : + mount_details.name => mount_details.mount_path + } +} + output "mount_paths_info" { description = "Information about mount paths" value = { @@ -31,23 +42,3 @@ output "total_mount_paths" { description = "Total Mount paths" value = ibm_is_share_mount_target.share_target_sg[*].mount_path } - -#output "mount_paths_excluding_first" { -# description = "Mount paths excluding the first element" -# value = ibm_is_share_mount_target.share_target_vpc[*].mount_path[1:] -#} - -#output "mount_paths_excluding_first" { -# description = "Mount paths excluding the first element" -# value = length(ibm_is_share_mount_target.share_target_sg[*].mount_path) > 1 ? slice(ibm_is_share_mount_target.share_target_sg[*].mount_path, 1, length(ibm_is_share_mount_target.share_target_sg[*].mount_path) - 1) : [] -#} - -#output "mount_paths_excluding_first" { -# description = "Mount paths excluding the first element" -# value = length(ibm_is_share_mount_target.share_target_sg[*].mount_path) > 1 ? slice(ibm_is_share_mount_target.share_target_sg[*].mount_path, 1, length(ibm_is_share_mount_target.share_target_sg[*].mount_path) - 1) : [] -#} - -#output "mount_paths_excluding_first" { -# description = "Mount paths excluding the first element" -# value = length(ibm_is_share_mount_target.share_target_sg[*].mount_path) > 1 ? 
tail(ibm_is_share_mount_target.share_target_sg[*].mount_path, length(ibm_is_share_mount_target.share_target_sg[*].mount_path) - 1) : [] -#} diff --git a/modules/file_storage/variables.tf b/modules/file_storage/variables.tf index 9de6e76b..3d1e598c 100644 --- a/modules/file_storage/variables.tf +++ b/modules/file_storage/variables.tf @@ -3,11 +3,7 @@ variable "zone" { type = string } -############################################################################## -# Resource Groups Variables -############################################################################## - -variable "resource_group" { +variable "resource_group_id" { description = "String describing resource groups to create or reference" type = string default = null @@ -31,6 +27,24 @@ variable "encryption_key_crn" { default = null } +variable "existing_kms_instance_guid" { + type = string + default = null + description = "GUID of boot volume encryption key" +} + +variable "skip_iam_share_authorization_policy" { + type = bool + default = false + description = "When using an existing KMS instance name, set this value to true if authorization is already enabled between KMS instance and the VPC file share. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment.For more information on how to create authorization policy manually, see [creating authorization policies for VPC file share](https://cloud.ibm.com/docs/vpc?topic=vpc-file-s2s-auth&interface=ui)." +} + +variable "kms_encryption_enabled" { + description = "Enable Key management" + type = bool + default = true +} + variable "vpc_id" { type = string default = null @@ -48,26 +62,3 @@ variable "subnet_id" { description = "Subnet ID to mount file share" default = null } - -variable "prefix" { - description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." - type = string -} - -variable "existing_kms_instance_guid" { - type = string - default = null - description = "GUID of boot volume encryption key" -} - -variable "skip_iam_share_authorization_policy" { - type = bool - default = false - description = "Set it to false if authorization policy is required for VPC file share to access kms. This can be set to true if authorization policy already exists. For more information on how to create authorization policy manually, see [creating authorization policies for VPC file share](https://cloud.ibm.com/docs/vpc?topic=vpc-file-s2s-auth&interface=ui)." 
-} - -variable "kms_encryption_enabled" { - description = "Enable Key management" - type = bool - default = true -} diff --git a/modules/file_storage/version.tf b/modules/file_storage/version.tf index a1a677f0..b2c90592 100644 --- a/modules/file_storage/version.tf +++ b/modules/file_storage/version.tf @@ -3,11 +3,11 @@ terraform { required_providers { ibm = { source = "IBM-Cloud/ibm" - version = ">= 1.56.2" + version = ">= 1.68.1, < 2.0.0" } time = { source = "hashicorp/time" - version = ">=0.11.2" + version = ">= 0.9.1, < 1.0.0" } } } diff --git a/modules/inventory/main.tf b/modules/inventory/main.tf index f2a2697f..862149b4 100644 --- a/modules/inventory/main.tf +++ b/modules/inventory/main.tf @@ -1,4 +1,66 @@ -resource "local_sensitive_file" "itself" { - content = join("\n", concat([var.server_name, var.user], var.hosts)) +locals { + ldap_server_inventory = format("%s/ldap_server_inventory.ini", var.playbooks_path) +} + +resource "local_sensitive_file" "mount_path_file" { + content = < [terraform](#requirement\_terraform) | >= 1.9.0 | -| [ibm](#requirement\_ibm) | >= 1.56.2 | +| [http](#requirement\_http) | 3.2.1 | +| [ibm](#requirement\_ibm) | >= 1.53.0 | ## Providers -| Name | Version | -|------|---------| -| [ibm](#provider\_ibm) | >= 1.56.2 | +No providers. ## Modules | Name | Source | Version | |------|--------|---------| -| [landing\_zone](#module\_landing\_zone) | terraform-ibm-modules/landing-zone/ibm | 6.6.3 | +| [landing-zone](#module\_landing-zone) | terraform-ibm-modules/landing-zone/ibm | 4.5.5 | ## Resources -| Name | Type | -|------|------| -| [ibm_is_subnet.subnet](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_subnet) | data source | -| [ibm_is_vpc.itself](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_vpc) | data source | -| [ibm_kms_key.kms_key](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/kms_key) | data source | -| [ibm_resource_instance.kms_instance](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/resource_instance) | data source | +No resources. ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [bastion\_subnets\_cidr](#input\_bastion\_subnets\_cidr) | Subnet CIDR block to launch the bastion host. | `list(string)` |
[
"10.0.0.0/24"
]
| no | -| [compute\_subnets\_cidr](#input\_compute\_subnets\_cidr) | Subnet CIDR block to launch the compute cluster host. | `list(string)` |
[
"10.10.20.0/24",
"10.20.20.0/24",
"10.30.20.0/24"
]
| no | -| [cos\_expiration\_days](#input\_cos\_expiration\_days) | Specify the number of days after object creation to expire objects in COS buckets. | `number` | `30` | no | +| [allowed\_cidr](#input\_allowed\_cidr) | Network CIDR to access the VPC. This is used to manage network ACL rules for accessing the cluster. | `list(string)` |
[
"10.0.0.0/8"
]
| no | +| [deployer\_ssh\_keys](#input\_deployer\_ssh\_keys) | The key pair to use to access the deployer host. | `list(string)` | n/a | yes | +| [compute\_ssh\_keys](#input\_compute\_ssh\_keys) | The key pair to use to launch the compute host. | `list(string)` | n/a | yes | +| [compute\_subnets\_cidr](#input\_compute\_subnets\_cidr) | Subnet CIDR block to launch the compute cluster host. | `list(string)` |
[
"10.10.10.0/24",
"10.20.10.0/24",
"10.30.10.0/24"
]
| no | | [cos\_instance\_name](#input\_cos\_instance\_name) | Exiting COS instance name | `string` | `null` | no | -| [enable\_atracker](#input\_enable\_atracker) | Enable Activity tracker on COS | `bool` | `true` | no | +| [enable\_atracker](#input\_enable\_atracker) | Enable Activity tracker | `bool` | `true` | no | +| [enable\_deployer](#input\_enable\_deployer) | deployer should be only used for better deployment performance | `bool` | `false` | no | | [enable\_cos\_integration](#input\_enable\_cos\_integration) | Integrate COS with HPC solution | `bool` | `true` | no | -| [enable\_landing\_zone](#input\_enable\_landing\_zone) | Run landing zone module. | `bool` | `true` | no | +| [enable\_client](#input\_enable\_client) | The solution supports multiple ways to connect to your HPC cluster for example, using client node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false. | `bool` | `true` | no | | [enable\_vpc\_flow\_logs](#input\_enable\_vpc\_flow\_logs) | Enable Activity tracker | `bool` | `true` | no | -| [enable\_vpn](#input\_enable\_vpn) | The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN, set this value to true. | `bool` | `false` | no | -| [key\_management](#input\_key\_management) | null/key\_protect | `string` | `null` | no | -| [kms\_instance\_name](#input\_kms\_instance\_name) | Name of the Key Protect instance associated with the Key Management Service. The ID can be found under the details of the KMS, see [View key-protect ID](https://cloud.ibm.com/docs/key-protect?topic=key-protect-retrieve-instance-ID&interface=ui). | `string` | `null` | no | -| [kms\_key\_name](#input\_kms\_key\_name) | Provide the existing KMS encryption key name that you want to use for the IBM Cloud HPC cluster. (for example kms\_key\_name: my-encryption-key). | `string` | `null` | no | -| [login\_subnet\_id](#input\_login\_subnet\_id) | List of existing subnet ID under the VPC, where the login/Bastion server will be provisioned. | `string` | `null` | no | +| [enable\_vpn](#input\_enable\_vpn) | The solution supports multiple ways to connect to your HPC cluster for example, using client node, via VPN or direct connection. If connecting to the HPC cluster via VPN, set this value to true. | `bool` | `false` | no | +| [hpcs\_instance\_name](#input\_hpcs\_instance\_name) | Hyper Protect Crypto Service instance | `string` | `null` | no | +| [ibmcloud\_api\_key](#input\_ibmcloud\_api\_key) | IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required. | `string` | `null` | no | +| [key\_management](#input\_key\_management) | null/key\_protect/hs\_crypto | `string` | `null` | no | +| [client\_ssh\_keys](#input\_client\_ssh\_keys) | The key pair to use to access the client host. | `list(string)` | n/a | yes | +| [client\_subnets\_cidr](#input\_client\_subnets\_cidr) | Subnet CIDR block to launch the client host. | `list(string)` |
[
"10.0.0.0/24"
]
| no | +| [management\_instances](#input\_management\_instances) | Number of instances to be launched for management. | `number` | `3` | no | +| [max\_compute\_instances](#input\_max\_compute\_instances) | Maximum number of instances to be launched for the compute cluster. | `number` | `250` | no | +| [min\_compute\_instances](#input\_min\_compute\_instances) | Minimum number of instances to be launched for the compute cluster. | `number` | `0` | no | | [network\_cidr](#input\_network\_cidr) | Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning. | `string` | `"10.0.0.0/8"` | no | -| [no\_addr\_prefix](#input\_no\_addr\_prefix) | Set it as true, if you don't want to create address prefixes. | `bool` | n/a | yes | -| [observability\_logs\_enable](#input\_observability\_logs\_enable) | Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Management/Compute Nodes will be ingested under COS bucket. | `bool` | `false` | no | +| [placement\_strategy](#input\_placement\_strategy) | VPC placement groups to create (null / host\_spread / power\_spread) | `string` | `null` | no | | [prefix](#input\_prefix) | A unique identifier for resources. Must begin with a letter and end with a letter or number. This prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters. | `string` | n/a | yes | +| [protocol\_instances](#input\_protocol\_instances) | Number of instances to be launched for protocol hosts. | `number` | `2` | no | +| [protocol\_subnets\_cidr](#input\_protocol\_subnets\_cidr) | Subnet CIDR block to launch the protocol hosts. | `list(string)` |
[
"10.10.30.0/24",
"10.20.30.0/24",
"10.30.30.0/24"
]
| no | | [resource\_group](#input\_resource\_group) | String describing resource groups to create or reference | `string` | `null` | no | -| [scc\_enable](#input\_scc\_enable) | Flag to enable SCC instance creation. If true, an instance of SCC (Security and Compliance Center) will be created. | `bool` | `false` | no | -| [skip\_flowlogs\_s2s\_auth\_policy](#input\_skip\_flowlogs\_s2s\_auth\_policy) | Skip auth policy between flow logs service and COS instance, set to true if this policy is already in place on account. | `bool` | `false` | no | -| [ssh\_keys](#input\_ssh\_keys) | The key pair to use to access the servers. | `list(string)` | n/a | yes | -| [subnet\_id](#input\_subnet\_id) | List of existing subnet IDs under the VPC, where the cluster will be provisioned. | `list(string)` | `null` | no | +| [storage\_instances](#input\_storage\_instances) | Number of instances to be launched for storage cluster. | `number` | `3` | no | +| [storage\_ssh\_keys](#input\_storage\_ssh\_keys) | The key pair to use to launch the storage cluster host. | `list(string)` | n/a | yes | +| [storage\_subnets\_cidr](#input\_storage\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `list(string)` |
[
"10.10.20.0/24",
"10.20.20.0/24",
"10.30.20.0/24"
]
| no | | [vpc](#input\_vpc) | Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc) | `string` | `null` | no | +| [vpn\_peer\_address](#input\_vpn\_peer\_address) | The peer public IP address to which the VPN will be connected. | `string` | `null` | no | +| [vpn\_peer\_cidr](#input\_vpn\_peer\_cidr) | The peer CIDRs (e.g., 192.168.0.0/24) to which the VPN will be connected. | `list(string)` | `null` | no | +| [vpn\_preshared\_key](#input\_vpn\_preshared\_key) | The pre-shared key for the VPN. | `string` | `null` | no | | [zones](#input\_zones) | Region where VPC will be created. To find your VPC region, use `ibmcloud is regions` command to find available regions. | `list(string)` | n/a | yes | ## Outputs -| Name | Description | -|------|-------------| -| [bastion\_subnets](#output\_bastion\_subnets) | Bastion subnets | -| [boot\_volume\_encryption\_key](#output\_boot\_volume\_encryption\_key) | Boot volume encryption key | -| [compute\_subnets](#output\_compute\_subnets) | Compute subnets | -| [cos\_buckets\_data](#output\_cos\_buckets\_data) | COS buckets data | -| [cos\_buckets\_names](#output\_cos\_buckets\_names) | Name of the COS Bucket created for SCC Instance | -| [cos\_instance\_crns](#output\_cos\_instance\_crns) | CRN of the COS instance created by Landing Zone Module | -| [key\_management\_guid](#output\_key\_management\_guid) | GUID for KMS instance | -| [login\_subnets](#output\_login\_subnets) | Login subnets | -| [protocol\_subnets](#output\_protocol\_subnets) | Protocol subnets | -| [public\_gateways](#output\_public\_gateways) | Public Gateway IDs | -| [resource\_group\_id](#output\_resource\_group\_id) | Resource group ID | -| [storage\_subnets](#output\_storage\_subnets) | Storage subnets | -| [subnets](#output\_subnets) | subnets | -| [subnets\_crn](#output\_subnets\_crn) | Subnets crn | -| [vpc\_cidr](#output\_vpc\_cidr) | To fetch the vpc cidr | -| [vpc\_crn](#output\_vpc\_crn) | VPC CRN | -| [vpc\_id](#output\_vpc\_id) | VPC ID | -| [vpc\_name](#output\_vpc\_name) | VPC name | +No outputs. diff --git a/modules/landing_zone/datasource.tf b/modules/landing_zone/datasource.tf index 27fa72ea..028278ec 100644 --- a/modules/landing_zone/datasource.tf +++ b/modules/landing_zone/datasource.tf @@ -10,12 +10,12 @@ data "ibm_kms_key" "kms_key" { key_name = var.kms_key_name } -data "ibm_is_vpc" "itself" { - count = var.vpc == null ? 0 : 1 - name = var.vpc +data "ibm_is_vpc" "existing_vpc" { + count = var.vpc_name != null ? 1 : 0 + name = var.vpc_name } data "ibm_is_subnet" "subnet" { - count = (var.vpc != null && length(var.subnet_id) > 0) ? 1 : 0 - identifier = var.subnet_id[count.index] + count = (var.vpc_name != null && length(var.compute_subnet_id) > 0) ? 
1 : 0 + identifier = var.compute_subnet_id } diff --git a/modules/landing_zone/locals.tf b/modules/landing_zone/locals.tf index 54c3a0f6..0c045ef9 100644 --- a/modules/landing_zone/locals.tf +++ b/modules/landing_zone/locals.tf @@ -3,25 +3,13 @@ locals { name = "lsf" prefix = var.prefix tags = [local.prefix, local.name] - # schematics_reserved_cidrs = [ - # "169.44.0.0/14", - # "169.60.0.0/14", - # "158.175.0.0/16", - # "158.176.0.0/15", - # "141.125.0.0/16", - # "161.156.0.0/16", - # "149.81.0.0/16", - # "159.122.111.224/27", - # "150.238.230.128/27", - # "169.55.82.128/27" - # ] # Derived values # Resource group calculation # If user defined then use existing else create new - create_resource_group = var.resource_group == "null" ? true : false - resource_groups = var.resource_group == "null" ? [ + create_resource_group = var.existing_resource_group == "null" ? true : false + resource_groups = var.existing_resource_group == "null" ? [ { name = "${local.prefix}-service-rg", create = local.create_resource_group, @@ -34,19 +22,32 @@ locals { } ] : [ { - name = var.resource_group, + name = var.existing_resource_group, create = local.create_resource_group } ] # For the variables looking for resource group names only (transit_gateway, key_management, atracker) - resource_group = var.resource_group == "null" ? "${local.prefix}-service-rg" : var.resource_group - region = join("-", slice(split("-", var.zones[0]), 0, 2)) - zones = ["zone-1", "zone-2", "zone-3"] + service_resource_group = var.existing_resource_group == "null" ? "${local.prefix}-service-rg" : var.existing_resource_group + + client_instance_count = sum(var.client_instances[*]["count"]) + management_instance_count = sum(var.management_instances[*]["count"]) + static_compute_instance_count = sum(var.compute_instances[*]["count"]) + storage_instance_count = var.storage_type == "persistent" ? sum(var.storage_servers[*]["count"]) : sum(var.storage_instances[*]["count"]) + protocol_instance_count = sum(var.protocol_instances[*]["count"]) + + # Region and Zone calculations + region = join("-", slice(split("-", var.zones[0]), 0, 2)) + zones = ["zone-1", "zone-2", "zone-3"] active_zones = [ for zone in var.zones : format("zone-%d", substr(zone, -1, -2)) ] - bastion_sg_variable_cidr_list = split(",", var.network_cidr) + # Future use + #zone_count = length(local.active_zones) + + bastion_sg_variable_cidr_list = split(",", var.cluster_cidr) + + # Address Prefixes calculation address_prefixes = { "zone-${element(split("-", var.zones[0]), 2)}" = [local.bastion_sg_variable_cidr_list[0]] } @@ -54,20 +55,40 @@ locals { # Subnet calculation active_subnets = { for zone in local.zones : zone => contains(local.active_zones, zone) ? [ + local.client_instance_count != 0 ? { + name = "client-subnet-${zone}" + acl_name = "hpc-acl" + cidr = var.client_subnets_cidr[index(local.active_zones, zone)] + public_gateway = true + no_addr_prefix = true + } : null, { name = "compute-subnet-${zone}" acl_name = "hpc-acl" - cidr = var.compute_subnets_cidr[index(local.active_zones, zone)] - public_gateway = var.vpc == null ? true : false - no_addr_prefix = var.no_addr_prefix - + cidr = var.vpc_cluster_private_subnets_cidr_blocks[index(local.active_zones, zone)] + public_gateway = true + no_addr_prefix = true }, + local.storage_instance_count != 0 ? 
{ + name = "storage-subnet-${zone}" + acl_name = "hpc-acl" + cidr = var.storage_subnets_cidr[index(local.active_zones, zone)] + public_gateway = true + no_addr_prefix = true + } : null, + local.storage_instance_count != 0 && local.protocol_instance_count != 0 ? { + name = "protocol-subnet-${zone}" + acl_name = "hpc-acl" + cidr = var.protocol_subnets_cidr[index(local.active_zones, zone)] + public_gateway = true + no_addr_prefix = true + } : null, zone == local.active_zones[0] ? { name = "bastion-subnet" acl_name = "hpc-acl" - cidr = var.bastion_subnets_cidr[0] - public_gateway = false - no_addr_prefix = var.no_addr_prefix + cidr = var.vpc_cluster_login_private_subnets_cidr_blocks + public_gateway = true + no_addr_prefix = true } : null ] : [] } @@ -77,22 +98,28 @@ locals { use_public_gateways = { for zone in local.zones : zone => contains(local.active_zones, zone) ? true : false } + + # VPC calculation + # If user defined then use existing else create new + # Calculate network acl rules (can be done inplace in vpcs) + # TODO: VPN expectation + cidrs_network_acl_rules = compact(flatten(["0.0.0.0/0"])) network_acl_inbound_rules = [ - { - name = "test-1" + for cidr_index in range(length(local.cidrs_network_acl_rules)) : { + name = format("allow-inbound-%s", cidr_index + 1) action = "allow" - destination = "0.0.0.0/0" + destination = var.cluster_cidr direction = "inbound" - source = "0.0.0.0/0" + source = element(local.cidrs_network_acl_rules, cidr_index) } ] network_acl_outbound_rules = [ - { - name = "test-2" + for cidr_index in range(length(local.cidrs_network_acl_rules)) : { + name = format("allow-outbound-%s", cidr_index + 1) action = "allow" - destination = "0.0.0.0/0" + destination = element(local.cidrs_network_acl_rules, cidr_index) direction = "outbound" - source = "0.0.0.0/0" + source = var.cluster_cidr } ] network_acl_rules = flatten([local.network_acl_inbound_rules, local.network_acl_outbound_rules]) @@ -105,19 +132,19 @@ locals { vpcs = [ { - existing_vpc_id = var.vpc == null ? null : data.ibm_is_vpc.itself[0].id - existing_subnets = (var.vpc != null && length(var.subnet_id) > 0) ? [ + existing_vpc_id = var.vpc_name == null ? null : data.ibm_is_vpc.existing_vpc[0].id + existing_subnets = (var.vpc_name != null && length(var.compute_subnet_id) > 0) ? [ { - id = var.subnet_id[0] + id = var.compute_subnet_id public_gateway = false }, { - id = var.login_subnet_id + id = var.bastion_subnet_id public_gateway = false } ] : null prefix = local.name - resource_group = var.resource_group == "null" ? "${local.prefix}-workload-rg" : var.resource_group + resource_group = var.existing_resource_group == "null" ? "${local.prefix}-workload-rg" : var.existing_resource_group clean_default_security_group = true clean_default_acl = true flow_logs_bucket_name = var.enable_vpc_flow_logs ? "vpc-flow-logs-bucket" : null @@ -128,9 +155,9 @@ locals { rules = local.network_acl_rules } ], - subnets = (var.vpc != null && length(var.subnet_id) > 0) ? null : local.subnets - use_public_gateways = var.vpc == null ? local.use_public_gateways : local.use_public_gateways_existing_vpc - address_prefixes = var.vpc == null ? local.address_prefixes : null + subnets = (var.vpc_name != null && length(var.compute_subnet_id) > 0) ? null : local.subnets + use_public_gateways = var.vpc_name == null ? local.use_public_gateways : local.use_public_gateways_existing_vpc + address_prefixes = var.vpc_name == null ? 
local.address_prefixes : null } ] @@ -140,6 +167,7 @@ locals { name = item } ] + vsi = [] # Define VPN @@ -147,28 +175,30 @@ locals { { name = "vpn-gw" vpc_name = local.name - subnet_name = length(var.subnet_id) == 0 ? "bastion-subnet" : data.ibm_is_subnet.subnet[0].name + subnet_name = length(var.compute_subnet_id) == 0 ? "bastion-subnet" : data.ibm_is_subnet.subnet[0].name mode = "policy" - resource_group = local.resource_group + resource_group = local.service_resource_group } ] : [] # Define transit gateway (to connect multiple VPC) enable_transit_gateway = false - transit_gateway_resource_group = local.resource_group - transit_gateway_connections = [var.vpc] + transit_gateway_global = false + transit_gateway_resource_group = local.service_resource_group + transit_gateway_connections = [var.vpc_name] active_cos = [ ( - var.enable_cos_integration || var.enable_vpc_flow_logs || var.enable_atracker || var.scc_enable || var.observability_logs_enable + var.enable_cos_integration || var.enable_vpc_flow_logs || var.enable_atracker || var.observability_logs_enable ) ? { name = var.cos_instance_name == null ? "hpc-cos" : var.cos_instance_name - resource_group = local.resource_group + resource_group = local.service_resource_group plan = "standard" random_suffix = true use_data = var.cos_instance_name == null ? false : true keys = [] skip_flowlogs_s2s_auth_policy = var.skip_flowlogs_s2s_auth_policy + skip_kms_s2s_auth_policy = var.skip_kms_s2s_auth_policy # Extra bucket for solution specific object storage buckets = [ @@ -187,7 +217,7 @@ locals { force_delete = true kms_key = var.key_management == "key_protect" ? (var.kms_key_name == null ? format("%s-slz-key", var.prefix) : var.kms_key_name) : null expire_rule = { - days = var.cos_expiration_days + days = 30 enable = true rule_id = "bucket-expire-rule" } @@ -199,7 +229,7 @@ locals { force_delete = true kms_key = var.key_management == "key_protect" ? (var.kms_key_name == null ? format("%s-atracker-key", var.prefix) : var.kms_key_name) : null expire_rule = { - days = var.cos_expiration_days + days = 30 enable = true rule_id = "bucket-expire-rule" } @@ -211,7 +241,7 @@ locals { force_delete = true kms_key = var.key_management == "key_protect" ? (var.kms_key_name == null ? format("%s-logs-data-key", var.prefix) : var.kms_key_name) : null expire_rule = { - days = var.cos_expiration_days + days = 30 enable = true rule_id = "bucket-expire-rule" } @@ -223,19 +253,7 @@ locals { force_delete = true kms_key = var.key_management == "key_protect" ? (var.kms_key_name == null ? format("%s-metrics-data-key", var.prefix) : var.kms_key_name) : null expire_rule = { - days = var.cos_expiration_days - enable = true - rule_id = "bucket-expire-rule" - } - } : null, - var.scc_enable ? { - name = "scc-bucket" - storage_class = "standard" - endpoint_type = "public" - force_delete = true - kms_key = var.key_management == "key_protect" ? (var.kms_key_name == null ? format("%s-scc-key", var.prefix) : var.kms_key_name) : null - expire_rule = { - days = var.cos_expiration_days + days = 30 enable = true rule_id = "bucket-expire-rule" } @@ -254,6 +272,7 @@ locals { use_data = instance.use_data keys = instance.keys skip_flowlogs_s2s_auth_policy = instance.skip_flowlogs_s2s_auth_policy + skip_kms_s2s_auth_policy = instance.skip_kms_s2s_auth_policy buckets = [ for bucket in instance.buckets : { @@ -288,9 +307,6 @@ locals { } : null, var.enable_atracker ? { name = format("%s-atracker-key", var.prefix) - } : null, - var.scc_enable ? 
{ - name = format("%s-scc-key", var.prefix) } : null ] : [ { @@ -298,9 +314,10 @@ locals { existing_key_crn = data.ibm_kms_key.kms_key[0].keys[0].crn } ]) : null + key_management = var.key_management == "key_protect" ? { name = var.kms_instance_name != null ? var.kms_instance_name : format("%s-kms", var.prefix) # var.key_management == "hs_crypto" ? var.hpcs_instance_name : format("%s-kms", var.prefix) - resource_group = local.resource_group + resource_group = local.service_resource_group use_hs_crypto = false keys = [for each in local.active_keys : each if each != null] use_data = var.kms_instance_name != null ? true : false @@ -311,50 +328,60 @@ locals { keys = [] use_data = null } + + total_vsis = sum([ + local.management_instance_count, + local.static_compute_instance_count, + local.storage_instance_count, + local.protocol_instance_count + ]) * length(local.active_zones) + placement_groups_count = var.placement_strategy == "host_spread" ? local.total_vsis / 12 : var.placement_strategy == "power_spread" ? local.total_vsis / 4 : 0 + vpc_placement_groups = [ + for placement_group in range(local.placement_groups_count) : { + name = format("%s", placement_group + 1) + resource_group = local.service_resource_group + strategy = var.placement_strategy + } + ] + + # Variables to explore + clusters = coalesce(var.clusters, []) + # Unexplored variables security_groups = [] virtual_private_endpoints = [] service_endpoints = "private" atracker = { - resource_group = local.resource_group - receive_global_events = false + resource_group = local.service_resource_group + receive_global_events = var.enable_atracker collector_bucket_name = "atracker-bucket" add_route = var.enable_atracker ? true : false } - secrets_manager = { - use_secrets_manager = false - } - access_groups = [] - f5_vsi = [] - #add_kms_block_storage_s2s = false - skip_kms_block_storage_s2s_auth_policy = true - clusters = [] - wait_till = "IngressReady" - teleport_vsi = [] - iam_account_settings = { - enable = false + wait_till = "IngressReady" + appid = { + use_appid = false } + teleport_vsi = [] teleport_config_data = { domain = var.prefix } + f5_vsi = [] f5_template_data = { license_type = "none" } - appid = { - use_appid = false - } + skip_kms_block_storage_s2s_auth_policy = true } # env variables (use to override) locals { env = { - #ibmcloud_api_key = var.ibmcloud_api_key resource_groups = local.resource_groups - network_cidr = var.network_cidr + cluster_cidr = var.cluster_cidr vpcs = local.vpcs vpn_gateways = local.vpn_gateways enable_transit_gateway = local.enable_transit_gateway + transit_gateway_global = local.transit_gateway_global transit_gateway_resource_group = local.transit_gateway_resource_group transit_gateway_connections = local.transit_gateway_connections vsi = local.vsi @@ -362,19 +389,17 @@ locals { cos = local.cos key_management = local.key_management atracker = local.atracker + vpc_placement_groups = local.vpc_placement_groups security_groups = local.security_groups virtual_private_endpoints = local.virtual_private_endpoints service_endpoints = local.service_endpoints - skip_kms_block_storage_s2s_auth_policy = local.skip_kms_block_storage_s2s_auth_policy clusters = local.clusters wait_till = local.wait_till - iam_account_settings = local.iam_account_settings - access_groups = local.access_groups - f5_vsi = local.f5_vsi - f5_template_data = local.f5_template_data appid = local.appid teleport_config_data = local.teleport_config_data teleport_vsi = local.teleport_vsi - secrets_manager = local.secrets_manager + f5_vsi 
= local.f5_vsi + f5_template_data = local.f5_template_data + skip_kms_block_storage_s2s_auth_policy = local.skip_kms_block_storage_s2s_auth_policy } } diff --git a/modules/landing_zone/main.tf b/modules/landing_zone/main.tf index eb759c1f..80688431 100644 --- a/modules/landing_zone/main.tf +++ b/modules/landing_zone/main.tf @@ -1,15 +1,16 @@ module "landing_zone" { count = var.enable_landing_zone ? 1 : 0 source = "terraform-ibm-modules/landing-zone/ibm" - version = "7.4.3" + version = "8.2.0" prefix = local.prefix region = local.region tags = local.tags resource_groups = local.env.resource_groups - network_cidr = local.env.network_cidr + network_cidr = local.env.cluster_cidr vpcs = local.env.vpcs vpn_gateways = local.env.vpn_gateways enable_transit_gateway = local.env.enable_transit_gateway + transit_gateway_global = local.env.transit_gateway_global transit_gateway_resource_group = local.env.transit_gateway_resource_group transit_gateway_connections = local.env.transit_gateway_connections ssh_keys = local.env.ssh_keys @@ -19,13 +20,14 @@ module "landing_zone" { cos = local.env.cos service_endpoints = local.env.service_endpoints key_management = local.env.key_management - skip_kms_block_storage_s2s_auth_policy = local.env.skip_kms_block_storage_s2s_auth_policy atracker = local.env.atracker clusters = local.env.clusters wait_till = local.env.wait_till - f5_vsi = local.env.f5_vsi - f5_template_data = local.env.f5_template_data appid = local.env.appid teleport_config_data = local.env.teleport_config_data teleport_vsi = local.env.teleport_vsi + f5_vsi = local.env.f5_vsi + f5_template_data = local.env.f5_template_data + vpc_placement_groups = local.env.vpc_placement_groups + skip_kms_block_storage_s2s_auth_policy = local.env.skip_kms_block_storage_s2s_auth_policy } diff --git a/modules/landing_zone/outputs.tf b/modules/landing_zone/outputs.tf index 31d8ba0f..37d17a3b 100644 --- a/modules/landing_zone/outputs.tf +++ b/modules/landing_zone/outputs.tf @@ -13,31 +13,14 @@ output "vpc_id" { value = module.landing_zone[*].vpc_data[0].vpc_id } -output "vpc_crn" { - description = "VPC CRN" - value = module.landing_zone[*].vpc_data[0].vpc_crn -} - -output "public_gateways" { - description = "Public Gateway IDs" - value = module.landing_zone[*].vpc_data[0].public_gateways -} - output "vpc_cidr" { description = "To fetch the vpc cidr" value = module.landing_zone[*].vpc_data[0].cidr_blocks[0] } -output "subnets" { - description = "subnets" - value = [for subnet in flatten(module.landing_zone[*].subnet_data) : { - name = subnet["name"] - id = subnet["id"] - zone = subnet["zone"] - cidr = subnet["cidr"] - crn = subnet["crn"] - } - ] +output "vpc_crn" { + description = "VPC CRN" + value = module.landing_zone[*].vpc_data[0].vpc_crn } output "bastion_subnets" { @@ -51,14 +34,14 @@ output "bastion_subnets" { ] } -output "login_subnets" { - description = "Login subnets" +output "client_subnets" { + description = "client subnets" value = [for subnet in flatten(module.landing_zone[*].subnet_data) : { name = subnet["name"] id = subnet["id"] zone = subnet["zone"] cidr = subnet["cidr"] - } if strcontains(subnet["name"], "-lsf-login-subnet") + } if strcontains(subnet["name"], "-lsf-client-subnet") ] } @@ -69,8 +52,6 @@ output "compute_subnets" { id = subnet["id"] zone = subnet["zone"] cidr = subnet["cidr"] - crn = subnet["crn"] - #ipv4_cidr_block = subnet["ipv4_cidr_block "] } if strcontains(subnet["name"], "-lsf-compute-subnet-zone-") ] } @@ -110,7 +91,12 @@ output "boot_volume_encryption_key" { output 
"key_management_guid" { description = "GUID for KMS instance" - value = var.key_management == "key_protect" ? module.landing_zone[0].key_management_guid : null + value = var.enable_landing_zone ? var.key_management != null ? module.landing_zone[0].key_management_guid : null : null +} + +output "cos_buckets_data" { + description = "COS buckets data" + value = flatten(module.landing_zone[*].cos_bucket_data) } output "cos_instance_crns" { @@ -123,9 +109,4 @@ output "cos_buckets_names" { value = flatten(module.landing_zone[*].cos_bucket_names) } -output "cos_buckets_data" { - description = "COS buckets data" - value = flatten(module.landing_zone[*].cos_bucket_data) -} - # TODO: Observability data diff --git a/modules/landing_zone/variables.tf b/modules/landing_zone/variables.tf index 8042b2a5..2cfc7a32 100644 --- a/modules/landing_zone/variables.tf +++ b/modules/landing_zone/variables.tf @@ -12,7 +12,7 @@ variable "enable_landing_zone" { # Resource Groups Variables ############################################################################## -variable "resource_group" { +variable "existing_resource_group" { description = "String describing resource groups to create or reference" type = string default = null @@ -41,28 +41,22 @@ variable "zones" { # VPC Variables ############################################################################## -variable "vpc" { +variable "vpc_name" { type = string description = "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" default = null } -variable "subnet_id" { - type = list(string) - default = null - description = "List of existing subnet IDs under the VPC, where the cluster will be provisioned." -} - -variable "login_subnet_id" { +variable "cluster_cidr" { + description = "Network CIDR of the VPC. This is used to manage network security rules for cluster provisioning." type = string - default = null - description = "List of existing subnet ID under the VPC, where the login/Bastion server will be provisioned." + default = "10.241.0.0/18" } -variable "network_cidr" { - description = "Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning." +variable "placement_strategy" { type = string - default = "10.0.0.0/8" + default = null + description = "VPC placement groups to create (null / host_spread / power_spread)" } variable "ssh_keys" { @@ -74,28 +68,145 @@ variable "ssh_keys" { # Access Variables ############################################################################## -variable "bastion_subnets_cidr" { - type = list(string) - default = ["10.0.0.0/24"] - description = "Subnet CIDR block to launch the bastion host." -} - -variable "enable_vpn" { - type = bool - default = false - description = "The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN, set this value to true." +variable "vpc_cluster_login_private_subnets_cidr_blocks" { + type = string + default = "10.0.0.0/24" + description = "Provide the CIDR block required for the creation of the login cluster's private subnet. Only one CIDR block is needed. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Since the login subnet is used only for the creation of login virtual server instances, provide a CIDR range of /28." 
} ############################################################################## # Compute Variables ############################################################################## +variable "client_subnets_cidr" { + type = list(string) + default = ["10.10.10.0/24", "10.20.10.0/24", "10.30.10.0/24"] + description = "Subnet CIDR block to launch the client host." +} + +variable "client_instances" { + type = list( + object({ + profile = string + count = number + }) + ) + default = [{ + profile = "cx2-2x4" + count = 1 + }] + description = "Number of instances to be launched for client." +} -variable "compute_subnets_cidr" { +variable "vpc_cluster_private_subnets_cidr_blocks" { type = list(string) default = ["10.10.20.0/24", "10.20.20.0/24", "10.30.20.0/24"] description = "Subnet CIDR block to launch the compute cluster host." } +variable "bastion_subnet_id" { + type = string + description = "ID of an existing subnet in which the bastion/login node will be deployed. If no value is given, a new subnet is created for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" + default = null +} + +variable "compute_subnet_id" { + type = string + description = "ID of an existing subnet in which the compute cluster resources will be deployed. If no value is given, a new subnet is created for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" + default = null +} + +variable "management_instances" { + type = list( + object({ + profile = string + count = number + }) + ) + default = [{ + profile = "cx2-2x4" + count = 3 + }] + description = "Number of instances to be launched for management." +} + +variable "compute_instances" { + type = list( + object({ + profile = string + count = number + }) + ) + default = [{ + profile = "cx2-2x4" + count = 0 + }] + description = "Minimum number of instances to be launched for the compute cluster." +} + +############################################################################## +# Scale Storage Variables +############################################################################## + +variable "storage_type" { + type = string + default = "scratch" + description = "Select the required storage type (scratch/persistent/eval)." +} + +variable "storage_subnets_cidr" { + type = list(string) + default = ["10.10.30.0/24", "10.20.30.0/24", "10.30.30.0/24"] + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "storage_instances" { + type = list( + object({ + profile = string + count = number + }) + ) + default = [{ + profile = "bx2-2x8" + count = 3 + }] + description = "Number of instances to be launched for storage cluster." +} + +variable "storage_servers" { + type = list( + object({ + profile = string + count = number + }) + ) + default = [{ + profile = "cx2d-metal-96x192" + count = 2 + }] + description = "Number of bare metal servers to be launched for the storage cluster." +} + +variable "protocol_subnets_cidr" { + type = list(string) + default = ["10.10.40.0/24", "10.20.40.0/24", "10.30.40.0/24"] + description = "Subnet CIDR block to launch the protocol hosts." +} + +variable "protocol_instances" { + type = list( + object({ + profile = string + count = number + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + }] + description = "Number of instances to be launched for protocol hosts."
+} + ############################################################################## # Observability Variables ############################################################################## @@ -115,13 +226,7 @@ variable "cos_instance_name" { variable "enable_atracker" { type = bool default = true - description = "Enable Activity tracker on COS" -} - -variable "cos_expiration_days" { - type = number - default = 30 - description = "Specify the number of days after object creation to expire objects in COS buckets." + description = "Enable Activity tracker" } variable "enable_vpc_flow_logs" { @@ -130,16 +235,6 @@ variable "enable_vpc_flow_logs" { description = "Enable Activity tracker" } -############################################################################## -# SCC Variables -############################################################################## - -variable "scc_enable" { - type = bool - default = false - description = "Flag to enable SCC instance creation. If true, an instance of SCC (Security and Compliance Center) will be created." -} - ############################################################################## # Encryption Variables ############################################################################## @@ -147,7 +242,7 @@ variable "scc_enable" { variable "key_management" { type = string default = null - description = "null/key_protect" + description = "Set the value as key_protect to enable customer-managed encryption for boot volume and file share. If key_management is set to null, IBM Cloud resources will always be encrypted with provider-managed keys." } variable "kms_instance_name" { @@ -162,19 +257,106 @@ variable "kms_key_name" { description = "Provide the existing KMS encryption key name that you want to use for the IBM Cloud HPC cluster. (for example kms_key_name: my-encryption-key)." } -variable "no_addr_prefix" { +# variable "hpcs_instance_name" { +# type = string +# default = null +# description = "Hyper Protect Crypto Service instance" +# } + +variable "skip_flowlogs_s2s_auth_policy" { type = bool - description = "Set it as true, if you don't want to create address prefixes." + default = false + description = "Skip auth policy between flow logs service and COS instance, set to true if this policy is already in place on account." +} + +variable "skip_kms_s2s_auth_policy" { + type = bool + default = false + description = "Skip auth policy between KMS service and COS instance, set to true if this policy is already in place on account." } ############################################################################## # Observability Variables ############################################################################## + variable "observability_logs_enable" { description = "Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Management/Compute Nodes will be ingested under COS bucket." type = bool default = false } -variable "skip_flowlogs_s2s_auth_policy" { +variable "enable_vpn" { type = bool default = false - description = "Skip auth policy between flow logs service and COS instance, set to true if this policy is already in place on account." + description = "The solution supports multiple ways to connect to your HPC cluster, for example, using a bastion node, via VPN, or a direct connection. If connecting to the HPC cluster via VPN, set this value to true."
+} +############################################################################## +# Landing Zone Variables +############################################################################## + +variable "clusters" { + default = null + description = "A list describing clusters workloads to create" + type = list( + object({ + name = string # Name of Cluster + vpc_name = string # Name of VPC + subnet_names = list(string) # List of vpc subnets for cluster + workers_per_subnet = number # Worker nodes per subnet. + machine_type = string # Worker node flavor + kube_type = string # iks or openshift + kube_version = optional(string) # Can be a version from `ibmcloud ks versions` or `default` + entitlement = optional(string) # entitlement option for openshift + secondary_storage = optional(string) # Secondary storage type + pod_subnet = optional(string) # Portable subnet for pods + service_subnet = optional(string) # Portable subnet for services + resource_group = string # Resource Group used for cluster + cos_name = optional(string) # Name of COS instance Required only for OpenShift clusters + access_tags = optional(list(string), []) + boot_volume_crk_name = optional(string) # Boot volume encryption key name + disable_public_endpoint = optional(bool, true) # disable cluster public, leaving only private endpoint + disable_outbound_traffic_protection = optional(bool, false) # public outbound access from the cluster workers + cluster_force_delete_storage = optional(bool, false) # force the removal of persistent storage associated with the cluster during cluster deletion + operating_system = string # The operating system of the workers in the default worker pool. See https://cloud.ibm.com/docs/openshift?topic=openshift-openshift_versions#openshift_versions_available . + kms_wait_for_apply = optional(bool, true) # make terraform wait until KMS is applied to master and it is ready and deployed + verify_cluster_network_readiness = optional(bool, true) # Flag to run a script will run kubectl commands to verify that all worker nodes can communicate successfully with the master. If the runtime does not have access to the kube cluster to run kubectl commands, this should be set to false. + use_ibm_cloud_private_api_endpoints = optional(bool, true) # Flag to force all cluster related api calls to use the IBM Cloud private endpoints. + import_default_worker_pool_on_create = optional(bool) # (Advanced users) Whether to handle the default worker pool as a stand-alone ibm_container_vpc_worker_pool resource on cluster creation. Only set to false if you understand the implications of managing the default worker pool as part of the cluster resource. Set to true to import the default worker pool as a separate resource. Set to false to manage the default worker pool as part of the cluster resource. + allow_default_worker_pool_replacement = optional(bool) # (Advanced users) Set to true to allow the module to recreate a default worker pool. Only use in the case where you are getting an error indicating that the default worker pool cannot be replaced on apply. Once the default worker pool is handled as a stand-alone ibm_container_vpc_worker_pool, if you wish to make any change to the default worker pool which requires the re-creation of the default pool set this variable to true + labels = optional(map(string)) # A list of labels that you want to add to the default worker pool. 
+ addons = optional(object({ # Map of OCP cluster add-on versions to install + debug-tool = optional(string) + image-key-synchronizer = optional(string) + openshift-data-foundation = optional(string) + vpc-file-csi-driver = optional(string) + static-route = optional(string) + cluster-autoscaler = optional(string) + vpc-block-csi-driver = optional(string) + ibm-storage-operator = optional(string) + }), {}) + manage_all_addons = optional(bool, false) # Instructs Terraform to manage all cluster addons, even if addons were installed outside of the module. If set to 'true' this module will destroy any addons that were installed by other sources. + kms_config = optional( + object({ + crk_name = string # Name of key + private_endpoint = optional(bool) # Private endpoint + }) + ) + worker_pools = optional( + list( + object({ + name = string # Worker pool name + vpc_name = string # VPC name + workers_per_subnet = number # Worker nodes per subnet + flavor = string # Worker node flavor + subnet_names = list(string) # List of vpc subnets for worker pool + entitlement = optional(string) # entitlement option for openshift + secondary_storage = optional(string) # Secondary storage type + boot_volume_crk_name = optional(string) # Boot volume encryption key name + operating_system = string # The operating system of the workers in the worker pool. See https://cloud.ibm.com/docs/openshift?topic=openshift-openshift_versions#openshift_versions_available . + labels = optional(map(string)) # A list of labels that you want to add to all the worker nodes in the worker pool. + }) + ) + ) + }) + ) } diff --git a/modules/landing_zone/version.tf b/modules/landing_zone/version.tf index 124b3869..a2e0a027 100644 --- a/modules/landing_zone/version.tf +++ b/modules/landing_zone/version.tf @@ -1,9 +1,13 @@ +############################################################################## +# Terraform Providers +############################################################################## + terraform { required_version = ">= 1.9.0" required_providers { ibm = { source = "IBM-Cloud/ibm" - version = ">= 1.56.2" + version = ">= 1.68.1, < 2.0.0" } } } diff --git a/modules/landing_zone_vsi/README.md b/modules/landing_zone_vsi/README.md index 42fbdef5..c5a94631 100644 --- a/modules/landing_zone_vsi/README.md +++ b/modules/landing_zone_vsi/README.md @@ -2,138 +2,105 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.0 | -| [ibm](#requirement\_ibm) | >= 1.56.2 | -| [template](#requirement\_template) | ~> 2 | +| [http](#requirement\_http) | 3.2.1 | +| [ibm](#requirement\_ibm) | >= 1.53.0 | ## Providers | Name | Version | |------|---------| -| [ibm](#provider\_ibm) | >= 1.56.2 | -| [template](#provider\_template) | ~> 2 | +| [http](#provider\_http) | 3.2.1 | +| [ibm](#provider\_ibm) | 1.56.1 | ## Modules | Name | Source | Version | |------|--------|---------| -| [compute\_key](#module\_compute\_key) | ./../key | n/a | -| [compute\_sg](#module\_compute\_sg) | terraform-ibm-modules/security-group/ibm | 2.6.2 | -| [compute\_sg\_with\_ldap\_connection](#module\_compute\_sg\_with\_ldap\_connection) | terraform-ibm-modules/security-group/ibm | 2.6.2 | -| [do\_management\_candidate\_vsi\_configuration](#module\_do\_management\_candidate\_vsi\_configuration) | ./../../modules/null/remote_exec_script | n/a | -| [do\_management\_vsi\_configuration](#module\_do\_management\_vsi\_configuration) | ./../../modules/null/remote_exec_script | n/a | -| [generate\_db\_password](#module\_generate\_db\_password) | 
../../modules/security/password | n/a | -| [ldap\_vsi](#module\_ldap\_vsi) | terraform-ibm-modules/landing-zone-vsi/ibm | 4.5.0 | -| [login\_vsi](#module\_login\_vsi) | terraform-ibm-modules/landing-zone-vsi/ibm | 4.5.0 | -| [management\_candidate\_vsi](#module\_management\_candidate\_vsi) | terraform-ibm-modules/landing-zone-vsi/ibm | 4.5.0 | -| [management\_vsi](#module\_management\_vsi) | terraform-ibm-modules/landing-zone-vsi/ibm | 4.5.0 | -| [nfs\_storage\_sg](#module\_nfs\_storage\_sg) | terraform-ibm-modules/security-group/ibm | 2.6.2 | -| [ssh\_connection\_to\_login\_node\_via\_cluster\_nodes](#module\_ssh\_connection\_to\_login\_node\_via\_cluster\_nodes) | terraform-ibm-modules/security-group/ibm | 2.6.2 | -| [ssh\_key](#module\_ssh\_key) | ./../key | n/a | -| [wait\_management\_candidate\_vsi\_booted](#module\_wait\_management\_candidate\_vsi\_booted) | ./../../modules/null/remote_exec | n/a | -| [wait\_management\_vsi\_booted](#module\_wait\_management\_vsi\_booted) | ./../../modules/null/remote_exec | n/a | -| [wait\_worker\_vsi\_booted](#module\_wait\_worker\_vsi\_booted) | ./../../modules/null/remote_exec | n/a | -| [worker\_vsi](#module\_worker\_vsi) | terraform-ibm-modules/landing-zone-vsi/ibm | 4.5.0 | +| [landing-zone](#module\_landing-zone) | terraform-ibm-modules/landing-zone/ibm | 4.5.5 | ## Resources | Name | Type | |------|------| +| [http_http.allowed_ip](https://registry.terraform.io/providers/hashicorp/http/3.2.1/docs/data-sources/http) | data source | | [ibm_is_image.compute](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_image) | data source | -| [ibm_is_image.ldap_vsi_image](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_image) | data source | -| [ibm_is_image.login](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_image) | data source | | [ibm_is_image.management](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_image) | data source | -| [ibm_is_instance_profile.management_node](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_instance_profile) | data source | -| [ibm_is_instance_profile.worker_node](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_instance_profile) | data source | -| [ibm_is_region.region](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_region) | data source | -| [ibm_is_ssh_key.bastion](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_ssh_key) | data source | +| [ibm_is_image.storage](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_image) | data source | +| [ibm_is_instance_profile.compute](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_instance_profile) | data source | +| [ibm_is_instance_profile.management](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_instance_profile) | data source | +| [ibm_is_instance_profile.protocol](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_instance_profile) | data source | +| [ibm_is_instance_profile.storage](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_instance_profile) | data source | +| [ibm_is_region.itself](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_region) | data source | | 
[ibm_is_ssh_key.compute](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_ssh_key) | data source | -| [template_file.ldap_user_data](https://registry.terraform.io/providers/hashicorp/template/latest/docs/data-sources/file) | data source | -| [template_file.login_user_data](https://registry.terraform.io/providers/hashicorp/template/latest/docs/data-sources/file) | data source | -| [template_file.management_user_data](https://registry.terraform.io/providers/hashicorp/template/latest/docs/data-sources/file) | data source | -| [template_file.management_values](https://registry.terraform.io/providers/hashicorp/template/latest/docs/data-sources/file) | data source | -| [template_file.worker_user_data](https://registry.terraform.io/providers/hashicorp/template/latest/docs/data-sources/file) | data source | +| [ibm_is_ssh_key.client](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_ssh_key) | data source | +| [ibm_is_ssh_key.management](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_ssh_key) | data source | +| [ibm_is_ssh_key.storage](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_ssh_key) | data source | +| [ibm_is_vpc.itself](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_vpc) | data source | +| [ibm_is_zone.itself](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/is_zone) | data source | +| [ibm_resource_group.itself](https://registry.terraform.io/providers/IBM-Cloud/ibm/latest/docs/data-sources/resource_group) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [app\_center\_gui\_pwd](#input\_app\_center\_gui\_pwd) | Password for IBM Spectrum LSF Application Center GUI. Note: Password should be at least 8 characters, must have one number, one lowercase letter, one uppercase letter, and at least one special character. | `string` | `""` | no | -| [app\_center\_high\_availability](#input\_app\_center\_high\_availability) | Set to false to disable the IBM Spectrum LSF Application Center GUI High Availability (default: true) . | `bool` | `true` | no | -| [bastion\_fip](#input\_bastion\_fip) | Bastion FIP. | `string` | n/a | yes | -| [bastion\_instance\_name](#input\_bastion\_instance\_name) | Bastion instance name. | `string` | `null` | no | -| [bastion\_private\_key\_content](#input\_bastion\_private\_key\_content) | Bastion private key content | `string` | n/a | yes | -| [bastion\_public\_key\_content](#input\_bastion\_public\_key\_content) | Bastion security group id. | `string` | `null` | no | -| [bastion\_security\_group\_id](#input\_bastion\_security\_group\_id) | Bastion security group id. | `string` | n/a | yes | -| [bastion\_subnets](#input\_bastion\_subnets) | Subnets to launch the bastion host. |
list(object({
name = string
id = string
zone = string
cidr = string
}))
| `[]` | no | -| [boot\_volume\_encryption\_key](#input\_boot\_volume\_encryption\_key) | CRN of boot volume encryption key | `string` | `null` | no | -| [ce\_project\_guid](#input\_ce\_project\_guid) | The GUID of the Code Engine Project associated to this cluster Reservation | `string` | n/a | yes | -| [cloud\_logs\_ingress\_private\_endpoint](#input\_cloud\_logs\_ingress\_private\_endpoint) | String describing resource groups to create or reference | `string` | `null` | no | -| [cloud\_monitoring\_access\_key](#input\_cloud\_monitoring\_access\_key) | IBM Cloud Monitoring access key for agents to use | `string` | n/a | yes | -| [cloud\_monitoring\_ingestion\_url](#input\_cloud\_monitoring\_ingestion\_url) | IBM Cloud Monitoring ingestion url for agents to use | `string` | n/a | yes | -| [cloud\_monitoring\_prws\_key](#input\_cloud\_monitoring\_prws\_key) | IBM Cloud Monitoring Prometheus Remote Write ingestion key | `string` | n/a | yes | -| [cloud\_monitoring\_prws\_url](#input\_cloud\_monitoring\_prws\_url) | IBM Cloud Monitoring Prometheus Remote Write ingestion url | `string` | n/a | yes | -| [cluster\_id](#input\_cluster\_id) | Ensure that you have received the cluster ID from IBM technical sales. A unique identifer for HPC cluster used by IBM Cloud HPC to differentiate different HPC clusters within the same contract. This can be up to 39 alphanumeric characters including the underscore (\_), the hyphen (-), and the period (.) characters. You cannot change the cluster ID after deployment. | `string` | n/a | yes | -| [cluster\_user](#input\_cluster\_user) | Linux user for cluster administration. | `string` | n/a | yes | -| [compute\_image\_name](#input\_compute\_image\_name) | Image name to use for provisioning the compute cluster instances. | `string` | `"hpcaas-lsf10-rhel810-compute-v8"` | no | -| [compute\_private\_key\_content](#input\_compute\_private\_key\_content) | Compute private key content | `string` | n/a | yes | +| [allowed\_cidr](#input\_allowed\_cidr) | Network CIDR to access the VPC. This is used to manage network ACL rules for accessing the cluster. | `list(string)` |
[
"10.0.0.0/8"
]
| no | +| [deployer\_ssh\_keys](#input\_deployer\_ssh\_keys) | The key pair to use to access the deployer host. | `list(string)` | n/a | yes | +| [compute\_dns\_domain](#input\_compute\_dns\_domain) | IBM Cloud DNS domain name to be used for compute cluster. | `string` | `"comp.com"` | no | +| [compute\_dns\_service\_id](#input\_compute\_dns\_service\_id) | IBM Cloud compute cluster DNS service resource id. | `string` | `null` | no | +| [compute\_dns\_zone\_id](#input\_compute\_dns\_zone\_id) | IBM Cloud compute cluster DNS zone id. | `string` | `null` | no | +| [compute\_gui\_password](#input\_compute\_gui\_password) | Password for compute cluster GUI | `string` | n/a | yes | +| [compute\_gui\_username](#input\_compute\_gui\_username) | GUI user to perform system management and monitoring tasks on compute cluster. | `string` | `"admin"` | no | +| [compute\_image\_name](#input\_compute\_image\_name) | Image name to use for provisioning the compute cluster instances. | `string` | `"ibm-redhat-8-10-minimal-amd64-2"` | no | +| [compute\_profile](#input\_compute\_profile) | Profile to be used for compute cluster virtual server instance. | `string` | `"cx2-2x4"` | no | | [compute\_ssh\_keys](#input\_compute\_ssh\_keys) | The key pair to use to launch the compute host. | `list(string)` | n/a | yes | -| [compute\_subnets](#input\_compute\_subnets) | Subnets to launch the compute host. |
list(object({
name = string
id = string
zone = string
cidr = string
crn = string
}))
| `[]` | no | -| [contract\_id](#input\_contract\_id) | Ensure that you have received the contract ID from IBM technical sales. Contract ID is a unique identifier to distinguish different IBM Cloud HPC service agreements. It must start with a letter and can only contain letters, numbers, hyphens (-), or underscores (\_). | `string` | n/a | yes | -| [db\_admin\_password](#input\_db\_admin\_password) | The IBM Cloud Database for MySQL password required to reference the PAC database. | `string` | `null` | no | -| [db\_instance\_info](#input\_db\_instance\_info) | The IBM Cloud Database for MySQL information required to reference the PAC database. |
object({
id = string
admin_user = string
hostname = string
port = number
certificate = string
})
| `null` | no | -| [dedicated\_host\_id](#input\_dedicated\_host\_id) | Dedicated Host for the worker nodes | `string` | `null` | no | -| [dns\_domain\_names](#input\_dns\_domain\_names) | IBM Cloud HPC DNS domain names. |
object({
compute = string
#storage = string
#protocol = string
})
|
{
"compute": "comp.com",
"protocol": "ces.com",
"storage": "strg.com"
}
| no | -| [enable\_app\_center](#input\_enable\_app\_center) | Set to true to enable the IBM Spectrum LSF Application Center GUI (default: false). [System requirements](https://www.ibm.com/docs/en/slac/10.2.0?topic=requirements-system-102-fix-pack-14) for IBM Spectrum LSF Application Center Version 10.2 Fix Pack 14. | `bool` | `false` | no | -| [enable\_dedicated\_host](#input\_enable\_dedicated\_host) | Set this option to true to enable dedicated hosts for the VSI created for workload servers, with the default value set to false. | `bool` | `false` | no | -| [enable\_ldap](#input\_enable\_ldap) | Set this option to true to enable LDAP for IBM Cloud HPC, with the default value set to false. | `bool` | `false` | no | -| [existing\_kms\_instance\_guid](#input\_existing\_kms\_instance\_guid) | GUID of boot volume encryption key | `string` | `null` | no | -| [file\_share](#input\_file\_share) | VPC file share mount points considering the ip address and the file share name | `list(string)` | n/a | yes | -| [hyperthreading\_enabled](#input\_hyperthreading\_enabled) | Setting this to true will enable hyper-threading in the compute nodes of the cluster (default). Otherwise, hyper-threading will be disabled. | `bool` | `true` | no | +| [compute\_subnets\_cidr](#input\_compute\_subnets\_cidr) | Subnet CIDR block to launch the compute cluster host. | `list(string)` |
[
"10.10.10.0/24",
"10.20.10.0/24",
"10.30.10.0/24"
]
| no | +| [cos\_instance\_name](#input\_cos\_instance\_name) | Existing COS instance name | `string` | `null` | no | +| [enable\_atracker](#input\_enable\_atracker) | Enable Activity tracker | `bool` | `true` | no | +| [enable\_deployer](#input\_enable\_deployer) | Deployer should only be used for better deployment performance | `bool` | `false` | no | +| [enable\_cos\_integration](#input\_enable\_cos\_integration) | Integrate COS with HPC solution | `bool` | `true` | no | +| [enable\_client](#input\_enable\_client) | The solution supports multiple ways to connect to your HPC cluster for example, using client node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false. | `bool` | `true` | no | +| [enable\_vpc\_flow\_logs](#input\_enable\_vpc\_flow\_logs) | Enable VPC flow logs | `bool` | `true` | no | +| [enable\_vpn](#input\_enable\_vpn) | The solution supports multiple ways to connect to your HPC cluster for example, using client node, via VPN or direct connection. If connecting to the HPC cluster via VPN, set this value to true. | `bool` | `false` | no | +| [hpcs\_instance\_name](#input\_hpcs\_instance\_name) | Hyper Protect Crypto Service instance | `string` | `null` | no | +| [ibm\_customer\_number](#input\_ibm\_customer\_number) | Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn). | `string` | `""` | no | | [ibmcloud\_api\_key](#input\_ibmcloud\_api\_key) | IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required. | `string` | `null` | no | -| [kms\_encryption\_enabled](#input\_kms\_encryption\_enabled) | Enable Key management | `bool` | `true` | no | -| [ldap\_admin\_password](#input\_ldap\_admin\_password) | The LDAP administrative password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@\_+:) are required. It is important to avoid including the username in the password for enhanced security.[This value is ignored for an existing LDAP server]. | `string` | `""` | no | -| [ldap\_basedns](#input\_ldap\_basedns) | The dns domain name is used for configuring the LDAP server. If an LDAP server is already in existence, ensure to provide the associated DNS domain name. | `string` | `"hpcaas.com"` | no | -| [ldap\_primary\_ip](#input\_ldap\_primary\_ip) | List of LDAP primary IPs. | `list(string)` | n/a | yes | -| [ldap\_server](#input\_ldap\_server) | Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created. | `string` | `"null"` | no | -| [ldap\_server\_cert](#input\_ldap\_server\_cert) | Provide the existing LDAP server certificate. If not provided, the value should be set to 'null'. | `string` | `"null"` | no | -| [ldap\_user\_name](#input\_ldap\_user\_name) | Custom LDAP User for performing cluster operations.
Note: Username should be between 4 to 32 characters, (any combination of lowercase and uppercase letters).[This value is ignored for an existing LDAP server] | `string` | `""` | no | -| [ldap\_user\_password](#input\_ldap\_user\_password) | The LDAP user password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@\_+:) are required.It is important to avoid including the username in the password for enhanced security.[This value is ignored for an existing LDAP server]. | `string` | `""` | no | -| [ldap\_vsi\_osimage\_name](#input\_ldap\_vsi\_osimage\_name) | Image name to be used for provisioning the LDAP instances. | `string` | `"ibm-ubuntu-22-04-4-minimal-amd64-3"` | no | -| [ldap\_vsi\_profile](#input\_ldap\_vsi\_profile) | Profile to be used for LDAP virtual server instance. | `string` | `"cx2-2x4"` | no | -| [login\_image\_name](#input\_login\_image\_name) | Image name to use for provisioning the login instance. | `string` | `"hpcaas-lsf10-rhel810-compute-v8"` | no | -| [login\_node\_instance\_type](#input\_login\_node\_instance\_type) | Specify the virtual server instance profile type to be used to create the login node for the IBM Cloud HPC cluster. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles). | `string` | `"bx2-2x8"` | no | -| [login\_private\_ips](#input\_login\_private\_ips) | Login private IPs | `string` | n/a | yes | -| [management\_image\_name](#input\_management\_image\_name) | Image name to use for provisioning the management cluster instances. | `string` | n/a | yes | -| [management\_node\_count](#input\_management\_node\_count) | Number of management nodes. This is the total number of management nodes. Enter a value between 1 and 10. | `number` | `3` | no | -| [management\_node\_instance\_type](#input\_management\_node\_instance\_type) | Specify the virtual server instance profile type to be used to create the management nodes for the IBM Cloud HPC cluster. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles). | `string` | `"bx2-16x64"` | no | -| [mount\_path](#input\_mount\_path) | Provide the path for the vpc file share to be mounted on to the HPC Cluster nodes |
list(object({
mount_path = string,
size = optional(number),
iops = optional(number),
nfs_share = optional(string)
}))
| n/a | yes | -| [observability\_logs\_enable\_for\_compute](#input\_observability\_logs\_enable\_for\_compute) | Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Compute Nodes will be ingested. | `bool` | `false` | no | -| [observability\_logs\_enable\_for\_management](#input\_observability\_logs\_enable\_for\_management) | Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Management Nodes will be ingested. | `bool` | `false` | no | -| [observability\_monitoring\_enable](#input\_observability\_monitoring\_enable) | Set true to enable IBM Cloud Monitoring instance provisioning. | `bool` | `false` | no | -| [observability\_monitoring\_on\_compute\_nodes\_enable](#input\_observability\_monitoring\_on\_compute\_nodes\_enable) | Set true to enable IBM Cloud Monitoring on Compute Nodes. | `bool` | `false` | no | +| [key\_management](#input\_key\_management) | null/key\_protect/hs\_crypto | `string` | `null` | no | +| [client\_ssh\_keys](#input\_client\_ssh\_keys) | The key pair to use to access the client host. | `list(string)` | n/a | yes | +| [client\_subnets\_cidr](#input\_client\_subnets\_cidr) | Subnet CIDR block to launch the client host. | `list(string)` |
[
"10.0.0.0/24"
]
| no | +| [management\_image\_name](#input\_management\_image\_name) | Image name to use for provisioning the management cluster instances. | `string` | `"ibm-redhat-8-10-minimal-amd64-2"` | no | +| [management\_instances](#input\_management\_instances) | Number of instances to be launched for management. | `number` | `3` | no | +| [management\_profile](#input\_management\_profile) | Profile to be used for management virtual server instance. | `string` | `"cx2-2x4"` | no | +| [max\_compute\_instances](#input\_max\_compute\_instances) | Maximum number of instances to be launched for the compute cluster. | `number` | `250` | no | +| [min\_compute\_instances](#input\_min\_compute\_instances) | Minimum number of instances to be launched for the compute cluster. | `number` | `0` | no | +| [network\_cidr](#input\_network\_cidr) | Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning. | `string` | `"10.0.0.0/8"` | no | +| [placement\_strategy](#input\_placement\_strategy) | VPC placement groups to create (null / host\_spread / power\_spread) | `string` | `null` | no | | [prefix](#input\_prefix) | A unique identifier for resources. Must begin with a letter and end with a letter or number. This prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters. | `string` | n/a | yes | +| [protocol\_dns\_domain](#input\_protocol\_dns\_domain) | IBM Cloud DNS domain name to be used for protocol cluster. | `string` | `"ces.com"` | no | +| [protocol\_dns\_service\_id](#input\_protocol\_dns\_service\_id) | IBM Cloud protocol cluster DNS service resource id. | `string` | `null` | no | +| [protocol\_dns\_zone\_id](#input\_protocol\_dns\_zone\_id) | IBM Cloud protocol cluster DNS zone id. | `string` | `null` | no | +| [protocol\_instances](#input\_protocol\_instances) | Number of instances to be launched for protocol hosts. | `number` | `2` | no | +| [protocol\_profile](#input\_protocol\_profile) | Profile to be used for the protocol virtual server instances. | `string` | `"bx2-2x8"` | no | +| [protocol\_subnets\_cidr](#input\_protocol\_subnets\_cidr) | Subnet CIDR block to launch the protocol hosts. | `list(string)` |
[
"10.10.30.0/24",
"10.20.30.0/24",
"10.30.30.0/24"
]
| no | | [resource\_group](#input\_resource\_group) | String describing resource groups to create or reference | `string` | `null` | no | -| [share\_path](#input\_share\_path) | Provide the exact path to where the VPC file share needs to be mounted | `string` | n/a | yes | -| [solution](#input\_solution) | Provide the value for the solution that is needed for the support of lsf and HPC | `string` | `"lsf"` | no | -| [ssh\_keys](#input\_ssh\_keys) | The key pair to use to access the host. | `list(string)` | n/a | yes | -| [storage\_security\_group\_id](#input\_storage\_security\_group\_id) | Existing Scale storage security group id | `string` | `null` | no | -| [vpc\_id](#input\_vpc\_id) | ID of an existing VPC in which the cluster resources will be deployed. | `string` | n/a | yes | -| [worker\_node\_instance\_type](#input\_worker\_node\_instance\_type) | The minimum number of worker nodes refers to the static worker nodes provisioned during cluster creation. The solution supports various instance types, so specify the node count based on the requirements of each instance profile. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles). |
list(object({
count = number
instance_type = string
}))
|
[
{
"count": 3,
"instance_type": "bx2-4x16"
},
{
"count": 0,
"instance_type": "cx2-8x16"
}
]
| no | -| [worker\_node\_max\_count](#input\_worker\_node\_max\_count) | The maximum number of worker nodes that can be deployed in the Spectrum LSF cluster. In order to use the [Resource Connector](https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=lsf-resource-connnector) feature to dynamically create and delete worker nodes based on workload demand, the value selected for this parameter must be larger than worker\_node\_min\_count. If you plan to deploy only static worker nodes in the LSF cluster, e.g., when using Spectrum Scale storage, the value for this parameter should be equal to worker\_node\_min\_count. Enter a value in the range 1 - 500. | `number` | `10` | no | +| [scheduler](#input\_scheduler) | Select one of the scheduler (LSF/Symphony/Slurm/None) | `string` | `"LSF"` | no | +| [storage\_dns\_domain](#input\_storage\_dns\_domain) | IBM Cloud DNS domain name to be used for storage cluster. | `string` | `"strg.com"` | no | +| [storage\_dns\_service\_id](#input\_storage\_dns\_service\_id) | IBM Cloud storage cluster DNS service resource id. | `string` | `null` | no | +| [storage\_dns\_zone\_id](#input\_storage\_dns\_zone\_id) | IBM Cloud storage cluster DNS zone id. | `string` | `null` | no | +| [storage\_gui\_password](#input\_storage\_gui\_password) | Password for storage cluster GUI | `string` | n/a | yes | +| [storage\_gui\_username](#input\_storage\_gui\_username) | GUI user to perform system management and monitoring tasks on storage cluster. | `string` | `"admin"` | no | +| [storage\_image\_name](#input\_storage\_image\_name) | Image name to use for provisioning the storage cluster instances. | `string` | `"ibm-redhat-8-10-minimal-amd64-2"` | no | +| [storage\_instances](#input\_storage\_instances) | Number of instances to be launched for storage cluster. | `number` | `3` | no | +| [storage\_profile](#input\_storage\_profile) | Profile to be used for storage cluster instance. | `string` | `"bx2d-2x8"` | no | +| [storage\_ssh\_keys](#input\_storage\_ssh\_keys) | The key pair to use to launch the storage cluster host. | `list(string)` | n/a | yes | +| [storage\_subnets\_cidr](#input\_storage\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `list(string)` |
[
"10.10.20.0/24",
"10.20.20.0/24",
"10.30.20.0/24"
]
| no | +| [storage\_type](#input\_storage\_type) | Select the required storage type(scratch/persistent/eval). | `string` | `"scratch"` | no | +| [vpc](#input\_vpc) | Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc) | `string` | `null` | no | +| [vpc\_custom\_resolver\_id](#input\_vpc\_custom\_resolver\_id) | IBM Cloud DNS custom resolver id. | `string` | `null` | no | +| [vpn\_peer\_address](#input\_vpn\_peer\_address) | The peer public IP address to which the VPN will be connected. | `string` | `null` | no | +| [vpn\_peer\_cidr](#input\_vpn\_peer\_cidr) | The peer CIDRs (e.g., 192.168.0.0/24) to which the VPN will be connected. | `list(string)` | `null` | no | +| [vpn\_preshared\_key](#input\_vpn\_preshared\_key) | The pre-shared key for the VPN. | `string` | `null` | no | | [zones](#input\_zones) | Region where VPC will be created. To find your VPC region, use `ibmcloud is regions` command to find available regions. | `list(string)` | n/a | yes | ## Outputs -| Name | Description | -|------|-------------| -| [compute\_private\_key\_content](#output\_compute\_private\_key\_content) | Compute private key content | -| [compute\_public\_key\_content](#output\_compute\_public\_key\_content) | Compute public key content | -| [compute\_sg\_id](#output\_compute\_sg\_id) | Compute SG id | -| [image\_map\_entry\_found](#output\_image\_map\_entry\_found) | Available if the image name provided is located within the image map | -| [ldap\_server](#output\_ldap\_server) | LDAP server IP | -| [ldap\_vsi\_data](#output\_ldap\_vsi\_data) | Login VSI data | -| [login\_vsi\_data](#output\_login\_vsi\_data) | Login VSI data | -| [management\_candidate\_vsi\_data](#output\_management\_candidate\_vsi\_data) | Management candidate VSI data | -| [management\_vsi\_data](#output\_management\_vsi\_data) | Management VSI data | -| [worker\_vsi\_data](#output\_worker\_vsi\_data) | Static worker VSI data | +No outputs. 
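
For reference, the inputs documented in the table above are ordinary Terraform variables and can be supplied on the command line when planning or applying the root module. The snippet below is a minimal, hypothetical sketch: the prefix, zone, SSH key names, and GUI password are placeholder values (not taken from this repository), and any remaining required inputs not shown here would also need values.

```bash
# Hypothetical example of supplying a few of the documented inputs.
# All values below are placeholders; adjust them to your account and region.
terraform init
terraform plan \
  -var 'prefix=hpc-demo' \
  -var 'zones=["us-south-1"]' \
  -var 'scheduler=LSF' \
  -var 'storage_ssh_keys=["my-ssh-key"]' \
  -var 'client_ssh_keys=["my-ssh-key"]' \
  -var 'storage_gui_password=ChangeMe123'
```

In practice the sensitive values (such as `storage_gui_password`) would normally be passed through a `-var-file` or environment variables rather than on the command line.
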
diff --git a/modules/landing_zone_vsi/configuration_steps/.gitignore b/modules/landing_zone_vsi/configuration_steps/.gitignore deleted file mode 100644 index c46693cd..00000000 --- a/modules/landing_zone_vsi/configuration_steps/.gitignore +++ /dev/null @@ -1 +0,0 @@ -management_values diff --git a/modules/landing_zone_vsi/configuration_steps/compute_user_data_fragment.sh b/modules/landing_zone_vsi/configuration_steps/compute_user_data_fragment.sh deleted file mode 100644 index a84735cb..00000000 --- a/modules/landing_zone_vsi/configuration_steps/compute_user_data_fragment.sh +++ /dev/null @@ -1,518 +0,0 @@ -#!/bin/bash -# shellcheck disable=all - -if [ "$compute_user_data_vars_ok" != "1" ]; then - echo 2>&1 "fatal: vars block is missing" - exit 1 -fi - -echo "Logging initial env variables" >> $logfile -env|sort >> $logfile - -# Disallow root login -sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\"lsfadmin or vpcuser\\\" rather than the user \\\"root\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys - -# Updates the lsfadmin user as never expire -chage -I -1 -m 0 -M 99999 -E -1 -W 14 lsfadmin - -# Setup Hostname -HostIP=$(hostname -I | awk '{print $1}') -hostname=${cluster_prefix}-${HostIP//./-} -hostnamectl set-hostname "$hostname" - -echo "START $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile - -# Setup Network configuration -# Change the MTU setting as this is required for setting mtu as 9000 for communication to happen between clusters -if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release; then - # Replace the MTU value in the Netplan configuration - echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${network_interface}" - echo "DOMAIN=\"${dns_domain}\"" >> "/etc/sysconfig/network-scripts/ifcfg-${network_interface}" - # Change the MTU setting as 9000 at router level. - gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) - cidr_range=$(ip route show | grep "kernel" | awk '{print $1}' | head -n 1) - echo "$cidr_range via $gateway_ip dev ${network_interface} metric 0 mtu 9000" >> /etc/sysconfig/network-scripts/route-eth0 - # Restart the Network Manager. - systemctl restart NetworkManager -elif grep -q "NAME=\"Ubuntu\"" /etc/os-release; then - net_int=$(basename /sys/class/net/en*) - netplan_config="/etc/netplan/50-cloud-init.yaml" - gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) - cidr_range=$(ip route show | grep "kernel" | awk '{print $1}' | head -n 1) - usermod -s /bin/bash lsfadmin - # Replace the MTU value in the Netplan configuration - if ! grep -qE "^[[:space:]]*mtu: 9000" $netplan_config; then - echo "MTU 9000 Packages entries not found" - # Append the MTU configuration to the Netplan file - sudo sed -i '/'"$net_int"':/a\ mtu: 9000' $netplan_config - sudo sed -i "/dhcp4: true/a \ nameservers:\n search: [$dns_domain]" $netplan_config - sudo sed -i '/'"$net_int"':/a\ routes:\n - to: '"$cidr_range"'\n via: '"$gateway_ip"'\n metric: 100\n mtu: 9000' $netplan_config - sudo netplan apply - echo "MTU set to 9000 on Netplan." - else - echo "MTU entry already exists in Netplan. Skipping." 
- fi -fi - -# Setup VPC FileShare | NFS Mount -LSF_TOP="/opt/ibm/lsf" -echo "Initiating LSF share mount" >> $logfile - -# Function to attempt NFS mount with retries -mount_nfs_with_retries() { - local server_path=$1 - local client_path=$2 - local retries=5 - local success=false - - rm -rf "${client_path}" - mkdir -p "${client_path}" - - for (( j=0; j> $logfile - if mount | grep -q "${client_path}"; then - echo "Mount successful for ${server_path} on ${client_path}" >> $logfile - success=true - break - else - echo "Attempt $((j+1)) of $retries failed for ${server_path} on ${client_path}" >> $logfile - sleep 2 - fi - done - - if [ "$success" = true ]; then - chmod 777 "${client_path}" - echo "${server_path} ${client_path} nfs rw,sec=sys,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,_netdev 0 0" >> /etc/fstab - else - echo "Mount not found for ${server_path} on ${client_path} after $retries attempts." >> $logfile - rm -rf "${client_path}" - fi - - # Convert success to numeric for return - if [ "$success" = true ]; then - return 0 - else - return 1 - fi -} - -# Setup LSF share -if [ -n "${nfs_server_with_mount_path}" ]; then - echo "File share ${nfs_server_with_mount_path} found" >> $logfile - nfs_client_mount_path="/mnt/lsf" - if mount_nfs_with_retries "${nfs_server_with_mount_path}" "${nfs_client_mount_path}"; then - # Move stuff to shared fs - for dir in conf work das_staging_area; do - rm -rf "${LSF_TOP}/$dir" - ln -fs "${nfs_client_mount_path}/$dir" "${LSF_TOP}/$dir" - done - chown -R lsfadmin:root "${LSF_TOP}" - else - echo "Mount not found for ${nfs_server_with_mount_path}, Exiting !!" >> $logfile - exit 1 - fi -fi -echo "Setting LSF share is completed." >> $logfile - -# Setup Custom file shares -echo "Setting custom file shares." >> $logfile -if [ -n "${custom_file_shares}" ]; then - echo "Custom file share ${custom_file_shares} found" >> $logfile - file_share_array=(${custom_file_shares}) - mount_path_array=(${custom_mount_paths}) - length=${#file_share_array[@]} - - for (( i=0; i> $logfile - -# Setup LSF environment variables -LSF_TOP="/opt/ibm/lsf_worker" -LSF_TOP_VERSION=10.1 -LSF_CONF=$LSF_TOP/conf -LSF_CONF_FILE=$LSF_CONF/lsf.conf -LSF_HOSTS_FILE=$LSF_CONF/hosts -. 
$LSF_CONF/profile.lsf # WARNING: this may unset LSF_TOP and LSF_VERSION -echo "Logging env variables" >> $logfile -env | sort >> $logfile - -# Defining ncpus based on hyper-threading -if [ "$hyperthreading" == true ]; then - ego_define_ncpus="threads" -else - ego_define_ncpus="cores" - cat << 'EOT' > /root/lsf_hyperthreading -#!/bin/sh -for vcpu in $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | cut -s -d- -f2 | cut -d- -f2 | uniq); do - echo "0" > "/sys/devices/system/cpu/cpu"$vcpu"/online" -done -EOT - chmod 755 /root/lsf_hyperthreading - command="/root/lsf_hyperthreading" - sh $command && (crontab -l 2>/dev/null; echo "@reboot $command") | crontab - -fi -echo "EGO_DEFINE_NCPUS=${ego_define_ncpus}" >> $LSF_CONF_FILE - -# Update lsf configuration -echo 'LSB_MC_DISABLE_HOST_LOOKUP=Y' >> $LSF_CONF_FILE -echo "LSF_RSH=\"ssh -o 'PasswordAuthentication no' -o 'StrictHostKeyChecking no'\"" >> $LSF_CONF_FILE -sed -i "s/LSF_SERVER_HOSTS=.*/LSF_SERVER_HOSTS=\"$ManagementHostNames\"/g" $LSF_CONF_FILE - -# TODO: Understand usage -# Support rc_account resource to enable RC_ACCOUNT policy -if [ -n "${rc_account}" ]; then - sed -i "s/\(LSF_LOCAL_RESOURCES=.*\)\"/\1 [resourcemap ${rc_account}*rc_account]\"/" $LSF_CONF_FILE - echo "Update LSF_LOCAL_RESOURCES lsf.conf successfully, add [resourcemap ${rc_account}*rc_account]" >> $logfile -fi -# Support for multiprofiles for the Job submission -if [ -n "${family}" ]; then - sed -i "s/\(LSF_LOCAL_RESOURCES=.*\)\"/\1 [resourcemap ${family}*family]\"/" $LSF_CONF_FILE - echo "update LSF_LOCAL_RESOURCES lsf.conf successfully, add [resourcemap ${pricing}*family]" >> $logfile -fi -# Add additional local resources if needed -instance_id=$(dmidecode | grep Family | cut -d ' ' -f 2 |head -1) -if [ -n "$instance_id" ]; then - sed -i "s/\(LSF_LOCAL_RESOURCES=.*\)\"/\1 [resourcemap $instance_id*instanceID]\"/" $LSF_CONF_FILE - echo "Update LSF_LOCAL_RESOURCES in $LSF_CONF_FILE successfully, add [resourcemap ${instance_id}*instanceID]" >> $logfile -else - echo "Can not get instance ID" >> $logfile -fi - -#Update LSF Tuning on dynamic hosts -LSF_TUNABLES="etc/sysctl.conf" -echo 'vm.overcommit_memory=1' >> $LSF_TUNABLES -echo 'net.core.rmem_max=26214400' >> $LSF_TUNABLES -echo 'net.core.rmem_default=26214400' >> $LSF_TUNABLES -echo 'net.core.wmem_max=26214400' >> $LSF_TUNABLES -echo 'net.core.wmem_default=26214400' >> $LSF_TUNABLES -echo 'net.ipv4.tcp_fin_timeout = 5' >> $LSF_TUNABLES -echo 'net.core.somaxconn = 8000' >> $LSF_TUNABLES -sudo sysctl -p $LSF_TUNABLES - -# Setup ssh -lsfadmin_home_dir="/home/lsfadmin" -lsfadmin_ssh_dir="${lsfadmin_home_dir}/.ssh" -mkdir -p $lsfadmin_ssh_dir -if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release; then - cp /home/vpcuser/.ssh/authorized_keys $lsfadmin_ssh_dir/authorized_keys -else - cp /home/ubuntu/.ssh/authorized_keys "${lsfadmin_ssh_dir}/authorized_keys" - sudo cp /home/ubuntu/.profile $lsfadmin_home_dir -fi -echo "${lsf_public_key}" >> $lsfadmin_ssh_dir/authorized_keys -echo "StrictHostKeyChecking no" >> $lsfadmin_ssh_dir/config -chmod 600 $lsfadmin_ssh_dir/authorized_keys -chmod 700 $lsfadmin_ssh_dir -chown -R lsfadmin:lsfadmin $lsfadmin_ssh_dir -echo "SSH key setup for lsfadmin user is completed" >> $logfile -echo "source ${LSF_CONF}/profile.lsf" >> $lsfadmin_home_dir/.bashrc -echo "source /opt/intel/oneapi/setvars.sh >> /dev/null" >> $lsfadmin_home_dir/.bashrc -echo "Setting up LSF env variables for lasfadmin user is completed" >> $logfile - -# Create lsf.sudoers file to support single 
lsfstartup and lsfrestart command from management node -echo 'LSF_STARTUP_USERS="lsfadmin"' | sudo tee -a /etc/lsf1.sudoers -echo "LSF_STARTUP_PATH=$LSF_TOP_VERSION/linux3.10-glibc2.17-x86_64/etc/" | sudo tee -a /etc/lsf.sudoers -chmod 600 /etc/lsf.sudoers -ls -l /etc/lsf.sudoers - -# Change LSF_CONF= value in lsf_daemons -cd /opt/ibm/lsf_worker/10.1/linux3.10-glibc2.17-x86_64/etc/ -sed -i "s|/opt/ibm/lsf/|/opt/ibm/lsf_worker/|g" lsf_daemons -cd - - -sudo ${LSF_TOP}/10.1/install/hostsetup --top="${LSF_TOP}" --setuid ### WARNING: LSF_TOP may be unset here -echo "Added LSF administrators to start LSF daemons" >> $logfile - -# Install LSF as a service and start up -/opt/ibm/lsf_worker/10.1/install/hostsetup --top="/opt/ibm/lsf_worker" --boot="y" --start="y" --dynamic 2>&1 >> $logfile -cat /opt/ibm/lsf/conf/hosts >> /etc/hosts - -# Setting up the LDAP configuration -if [ "$enable_ldap" = "true" ]; then - - # Detect if the operating system is RHEL or Rocky Linux - if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release || grep -q "NAME=\"Rocky Linux\"" /etc/os-release; then - - # Detect RHEL or Rocky version - version=$(grep -oE 'release [0-9]+' /etc/redhat-release | awk '{print $2}') - - # Proceed if the detected version is either 8 or 9 - if [ "$version" == "8" ] || [ "$version" == "9" ]; then - echo "Detected as RHEL or Rocky $version. Proceeding with LDAP client configuration..." >> $logfile - - # Enable password authentication for SSH by modifying the configuration file - sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config - systemctl restart sshd - - # Check if the SSL certificate file exists, then copy it to the correct location - # Retry finding SSL certificate with a maximum of 5 attempts and 5 seconds sleep between retries - for attempt in {1..5}; do - if [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ]; then - echo "LDAP SSL cert found under /mnt/lsf/openldap/ldap_cacert.pem path" >> $logfile - mkdir -p /etc/openldap/certs/ - cp -pr /mnt/lsf/openldap/ldap_cacert.pem /etc/openldap/certs/ldap_cacert.pem - break - else - echo "SSL cert not found on attempt $attempt. Retrying in 5 seconds..." >> $logfile - sleep 5 - fi - done - # Exit if the SSL certificate is still not found after 5 attempts - [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ] || { echo "SSL cert not found after 5 attempts. Exiting." 
>> $logfile; exit 1; } - - - # Create and configure the SSSD configuration file for LDAP integration - cat < /etc/sssd/sssd.conf -[sssd] -config_file_version = 2 -services = nss, pam, autofs -domains = default - -[nss] -homedir_substring = /home - -[pam] - -[domain/default] -id_provider = ldap -autofs_provider = ldap -auth_provider = ldap -chpass_provider = ldap -ldap_uri = ldap://${ldap_server_ip} -ldap_search_base = dc=${base_dn%%.*},dc=${base_dn#*.} -ldap_id_use_start_tls = True -ldap_tls_cacertdir = /etc/openldap/certs -cache_credentials = True -ldap_tls_reqcert = allow -EOF - - # Secure the SSSD configuration file by setting appropriate permissions - chmod 600 /etc/sssd/sssd.conf - chown root:root /etc/sssd/sssd.conf - - # Create and configure the OpenLDAP configuration file for TLS - cat < /etc/openldap/ldap.conf -BASE dc=${base_dn%%.*},dc=${base_dn#*.} -URI ldap://${ldap_server_ip} -TLS_CACERT /etc/openldap/certs/ldap_cacert.pem -TLS_CACERTDIR /etc/openldap/certs -EOF - - # Rehash certificates in the OpenLDAP directory to ensure proper recognition - openssl rehash /etc/openldap/certs - - # Apply the SSSD and home directory creation configuration using authselect - authselect select sssd with-mkhomedir --force - - # Enable and start the SSSD and oddjobd services for user authentication and home directory management - systemctl enable --now sssd oddjobd - - # Restart both services to apply the configuration - systemctl restart sssd oddjobd - - # Validate the LDAP configuration by performing a test search using ldapsearch - if ldapsearch -x -H ldap://"${ldap_server_ip}"/ -b "dc=${base_dn%%.*},dc=${base_dn#*.}" > /dev/null; then - echo "LDAP configuration completed successfully!" >> $logfile - else - echo "LDAP configuration failed! Exiting." >> $logfile - exit 1 - fi - - # Ensure LSF commands are available to all users by adding the profile to bashrc - echo ". ${LSF_CONF}/profile.lsf" >> /etc/bashrc - source /etc/bashrc - - else - echo "This script is intended for RHEL and Rocky Linux 8 or 9. Detected version: $version. Exiting." >> $logfile - exit 1 - fi - - # Detect if the operating system is Ubuntu - elif grep -q "NAME=\"Ubuntu\"" /etc/os-release; then - # Log detected OS - echo "Detected as Ubuntu. Proceeding with LDAP client configuration..." >> $logfile - - # Allow password authentication for SSH in two configuration files, then restart the SSH service - sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config - sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config.d/50-cloudimg-settings.conf - sudo systemctl restart ssh - - # Add configuration for automatic home directory creation to the PAM session configuration file - sudo sed -i '$ i\session required pam_mkhomedir.so skel=/etc/skel umask=0022\' /etc/pam.d/common-session - - # Check if the SSL certificate file exists, then copy it to the correct location - # Retry finding SSL certificate with a maximum of 5 attempts and 5 seconds sleep between retries - for attempt in {1..5}; do - if [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ]; then - mkdir -p /etc/ldap/certs/ - echo "LDAP SSL cert found under /mnt/lsf/openldap/ldap_cacert.pem path" >> $logfile - cp -pr /mnt/lsf/openldap/ldap_cacert.pem /etc/ldap/certs/ldap_cacert.pem - break - else - echo "SSL cert not found on attempt $attempt. Retrying in 5 seconds..." 
>> $logfile - sleep 5 - fi - done - # Exit if the SSL certificate is still not found after 5 attempts - [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ] || { echo "SSL cert not found after 5 attempts. Exiting." >> $logfile; exit 1; } - - # Create and configure the SSSD configuration file for LDAP integration on Ubuntu - cat < /etc/sssd/sssd.conf -[sssd] -config_file_version = 2 -services = nss, pam, autofs -domains = default - -[nss] -homedir_substring = /home - -[pam] - -[domain/default] -id_provider = ldap -autofs_provider = ldap -auth_provider = ldap -chpass_provider = ldap -ldap_uri = ldap://${ldap_server_ip} -ldap_search_base = dc=${base_dn%%.*},dc=${base_dn#*.} -ldap_id_use_start_tls = True -ldap_tls_cacertdir = /etc/ldap/certs -cache_credentials = True -ldap_tls_reqcert = allow -EOF - - # Secure the SSSD configuration file by setting appropriate permissions - sudo chmod 600 /etc/sssd/sssd.conf - sudo chown root:root /etc/sssd/sssd.conf - - # Create and configure the OpenLDAP configuration file for TLS on Ubuntu - cat < /etc/ldap/ldap.conf -BASE dc=${base_dn%%.*},dc=${base_dn#*.} -URI ldap://${ldap_server_ip} -TLS_CACERT /etc/ldap/certs/ldap_cacert.pem -TLS_CACERTDIR /etc/ldap/certs -EOF - - # Rehash certificates in the OpenLDAP directory to ensure proper recognition - openssl rehash /etc/ldap/certs - - # Enable and start the SSSD and oddjobd services for user authentication and home directory management - sudo systemctl enable --now sssd oddjobd && sudo systemctl restart sssd oddjobd - - # Ensure LSF commands are available to all users by adding the profile to bash.bashrc - echo ". ${LSF_CONF}/profile.lsf" >> /etc/bash.bashrc - source /etc/bash.bashrc - - # Validate the LDAP configuration by checking the status of the SSSD service - if sudo systemctl is-active --quiet sssd; then - echo "LDAP client configuration completed successfully!" >> $logfile - else - echo "LDAP client configuration failed! Exiting." >> $logfile - exit 1 - fi - - else - echo "This script is designed for RHEL, Rocky Linux, or Ubuntu. Unsupported OS detected. Exiting." 
>> $logfile - exit 1 - fi -fi - -#update lsf client ip address to LSF_HOSTS_FILE -echo "$login_ip_address $login_hostname" >> $LSF_HOSTS_FILE -# Startup lsf daemons -systemctl status lsfd >> "$logfile" - -# Setting up the Metrics Agent - -if [ "$cloud_monitoring_access_key" != "" ] && [ "$cloud_monitoring_ingestion_url" != "" ]; then - - SYSDIG_CONFIG_FILE="/opt/draios/etc/dragent.yaml" - - #packages installation - echo "Writing sysdig config file" >> "$logfile" - - #sysdig config file - echo "Setting customerid access key" >> "$logfile" - sed -i "s/==ACCESSKEY==/$cloud_monitoring_access_key/g" $SYSDIG_CONFIG_FILE - sed -i "s/==COLLECTOR==/$cloud_monitoring_ingestion_url/g" $SYSDIG_CONFIG_FILE - echo "tags: type:compute,lsf:true" >> $SYSDIG_CONFIG_FILE -else - echo "Skipping metrics agent configuration due to missing parameters" >> "$logfile" -fi - -if [ "$observability_monitoring_on_compute_nodes_enable" = true ]; then - - echo "Restarting sysdig agent" >> "$logfile" - systemctl enable dragent - systemctl restart dragent - else - echo "Metrics agent start skipped since monitoring provisioning is not enabled" >> "$logfile" -fi - -# Setting up the IBM Cloud Logs -if [ "$observability_logs_enable_for_compute" = true ]; then - - echo "Configuring cloud logs for compute since observability logs for compute is enabled" - sudo cp /root/post-config.sh /opt/ibm - cd /opt/ibm - - cat < /etc/fluent-bit/fluent-bit.conf -[SERVICE] - Flush 1 - Log_Level info - Daemon off - Parsers_File parsers.conf - Plugins_File plugins.conf - HTTP_Server On - HTTP_Listen 0.0.0.0 - HTTP_Port 9001 - Health_Check On - HC_Errors_Count 1 - HC_Retry_Failure_Count 1 - HC_Period 30 - storage.path /fluent-bit/cache - storage.max_chunks_up 192 - storage.metrics On - -[INPUT] - Name syslog - Path /tmp/in_syslog - Buffer_Chunk_Size 32000 - Buffer_Max_Size 64000 - Receive_Buffer_Size 512000 - -[INPUT] - Name tail - Tag * - Path /opt/ibm/lsf_worker/log/*.log - Path_Key file - Exclude_Path /var/log/at/** - DB /opt/ibm/lsf_worker/log/fluent-bit.DB - Buffer_Chunk_Size 32KB - Buffer_Max_Size 256KB - Skip_Long_Lines On - Refresh_Interval 10 - storage.type filesystem - storage.pause_on_chunks_overlimit on - -[FILTER] - Name modify - Match * - Add subsystemName compute - Add applicationName lsf - -@INCLUDE output-logs-router-agent.conf -EOL - - sudo chmod +x post-config.sh - sudo ./post-config.sh -h $cloud_logs_ingress_private_endpoint -p "3443" -t "/logs/v1/singles" -a IAMAPIKey -k $VPC_APIKEY_VALUE --send-directly-to-icl -s true -i Production - sudo echo "2024-10-16T14:31:16+0000 INFO Testing IBM Cloud LSF Logs from compute: $hostname" >> /opt/ibm/lsf_worker/log/test.log - sudo logger -u /tmp/in_syslog my_ident my_syslog_test_message_from_compute:$hostname - -else - echo "Cloud Logs configuration skipped since observability logs for compute is not enabled" -fi - -echo "END $(date '+%Y-%m-%d %H:%M:%S')" >> "$logfile" diff --git a/modules/landing_zone_vsi/configuration_steps/configure_management_vsi.sh b/modules/landing_zone_vsi/configuration_steps/configure_management_vsi.sh deleted file mode 100644 index b71f6047..00000000 --- a/modules/landing_zone_vsi/configuration_steps/configure_management_vsi.sh +++ /dev/null @@ -1,1338 +0,0 @@ -#!/bin/bash -# shellcheck disable=all - -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
-# Licensed under the Apache License v2.0 -################################################### - -#set -x # uncomment with care: this can log too much, including credentials - -# Setup logs -logfile="/tmp/configure_management.log" -exec > >(stdbuf -oL awk '{print strftime("%Y-%m-%dT%H:%M:%S") " " $0}' | tee "$logfile") 2>&1 -# automatic logging of stdout and stderr, including timestamps; no need to redirect explicitly - -echo "START $(date '+%Y-%m-%d %H:%M:%S')" - -source management_values - -# Local variable declaration -default_cluster_name="HPCCluster" -nfs_server_with_mount_path=${mount_path} -enable_ldap="${enable_ldap}" -ldap_server_ip="${ldap_server_ip}" -ldap_server_cert="${ldap_server_cert}" -base_dn="${ldap_basedns}" - -this_hostname="$(hostname)" -mgmt_hostname_primary="$management_hostname" -mgmt_hostnames="${management_hostname},${management_cand_hostnames}" -mgmt_hostnames="${mgmt_hostnames//,/ }" # replace commas with spaces -mgmt_hostnames="${mgmt_hostnames# }" # remove an initial space -mgmt_hostnames="${mgmt_hostnames% }" # remove a final space - -LSF_TOP="/opt/ibm/lsf" -LSF_CONF="$LSF_TOP/conf" -LSF_SSH="$LSF_TOP/ssh" -LSF_CONF_FILE="$LSF_CONF/lsf.conf" -LSF_HOSTS_FILE="$LSF_CONF/hosts" -LSF_EGO_CONF_FILE="$LSF_CONF/ego/$cluster_name/kernel/ego.conf" -LSF_LSBATCH_CONF="$LSF_CONF/lsbatch/$cluster_name/configdir" -LSF_RC_CONF="$LSF_CONF/resource_connector" -LSF_RC_IC_CONF="$LSF_RC_CONF/ibmcloudgen2/conf" -LSF_DM_STAGING_AREA="$LSF_TOP/das_staging_area" -# Should be changed in the upcoming days. Since the LSF core team have mismatched the path and we have approached to make the changes. -LSF_RC_IBMCLOUDHPC_CONF="$LSF_RC_CONF/ibmcloudhpc/conf" -LSF_TOP_VERSION="$LSF_TOP/10.1" - -# Useful variables that reference the main GUI and PERF Manager folders. -LSF_SUITE_TOP="/opt/ibm/lsfsuite" -LSF_SUITE_GUI="${LSF_SUITE_TOP}/ext/gui" -LSF_SUITE_GUI_CONF="${LSF_SUITE_GUI}/conf" -LSF_SUITE_GUI_WORK="${LSF_SUITE_GUI}/work" -LSF_SUITE_PERF="${LSF_SUITE_TOP}/ext/perf" -LSF_SUITE_PERF_CONF="${LSF_SUITE_PERF}/conf" -LSF_SUITE_PERF_BIN="${LSF_SUITE_PERF}/1.2/bin" - -# important: is this a primary or secondary management node? -if [ "$this_hostname" == "$mgmt_hostname_primary" ]; then - on_primary="true" -else - on_primary="false" -fi -echo "is this node primary: $on_primary" - -echo "umask=$(umask)" -umask 022 # since being run with 077 can cause issues -echo "umask=$(umask)" - -db_certificate_file="${LSF_SUITE_GUI_CONF}/cert.pem" - -# Function that dump the ICD certificate in the $db_certificate_file -create_certificate() { - # Dump the CA certificate in the ${db_certificate_file} file and set permissions - echo "${db_certificate}" | base64 -d > "${db_certificate_file}" - chown lsfadmin:lsfadmin "${db_certificate_file}" - chmod 644 "${db_certificate_file}" - -} - -# Function for creating PAC database in the IBM Cloud Database (ICD) service when High Availability is enabled. -# It is invoked when ${enable_app_center} and ${app_center_high_availability} are both true. -create_appcenter_database() { - # Required SQL commands to create the PAC database in the IBM Cloud Database (ICD) instance. 
- local create_db_command="CREATE DATABASE ${db_name} default character set utf8 default collate utf8_bin;" - local commands=( - "CREATE USER ${db_user}@'%' IDENTIFIED WITH mysql_native_password BY '${db_password}';" - "CREATE USER ${db_user}@'localhost' IDENTIFIED WITH mysql_native_password BY '${db_password}';" - "GRANT ALL ON ${db_name}.* TO ${db_user}@'%';" - "GRANT ALL ON ${db_name}.* TO ${db_user}@'localhost';" - "source ${LSF_SUITE_PERF}/ego/1.2/DBschema/MySQL/egodata.sql;" - "source ${LSF_SUITE_PERF}/lsf/10.0/DBschema/MySQL/lsfdata.sql;" - "source ${LSF_SUITE_PERF}/lsf/10.0/DBschema/MySQL/lsf_sql.sql;" - "source ${LSF_SUITE_GUI}/DBschema/MySQL/create_schema.sql;" - "source ${LSF_SUITE_GUI}/DBschema/MySQL/create_pac_schema.sql;" - "source ${LSF_SUITE_GUI}/DBschema/MySQL/init.sql;" - ) - - # On ICD you cannot change system variables so we need to comment 736 line in $LSF_SUITE_GUI/DBschema/MySQL/create_pac_schema.sql - sed -i "s|SET GLOBAL group_concat_max_len = 1000000;|/* SET GLOBAL group_concat_max_len = 1000000; */|" $LSF_SUITE_GUI/DBschema/MySQL/create_pac_schema.sql - # Create the PAC database - echo "${create_db_command}" | MYSQL_PWD="${db_adminpassword}" mysql --host="${db_hostname}" --port="${db_port}" --user="${db_adminuser}" --ssl-ca="${db_certificate_file}" ibmclouddb - # Create the pacuser, grant him all the required privileges, then create the schema and tables - for command in "${commands[@]}"; do - echo "${command}" | MYSQL_PWD="${db_adminpassword}" mysql --host="${db_hostname}" --port="${db_port}" --user="${db_adminuser}" --ssl-ca="${db_certificate_file}" pac - done -} - -# Configures the GUI JDBC datasource file ${LSF_SUITE_PERF_CONF}/datasource.xml -# to reference the IBM Cloud Database (ICD) instance. If ${enable_app_center} and -# ${app_center_high_availability} are both true, updates the connection string to -# point to the remote database service instead of the local MySQL server. -configure_icd_datasource() { - local default_connection_string="jdbc:mariadb://localhost:3306/pac?useUnicode=true&characterEncoding=UTF-8&serverTimezone=GMT" - local icd_connection_string="jdbc:mariadb://${db_hostname}:${db_port}/${db_name}?useUnicode=true\&characterEncoding=UTF-8\&serverTimezone=GMT\&requireSSL=true\&useSSL=true\&serverSslCert=${db_certificate_file}" - - # Change the connection string to use ICD - sed -i "s!Connection=\"${default_connection_string}\"!Connection=\"${icd_connection_string}\"!" ${LSF_SUITE_PERF_CONF}/datasource.xml - # Change the Cipher algorithm to AES128 in the Datasource definition - sed -i "s|Cipher=\".*\"|Cipher=\"aes128\"|" ${LSF_SUITE_PERF_CONF}/datasource.xml - # Encrypt the Database user and password with AES128 Cipher. 
The encryptTool.sh script requires the setting of the JAVA_HOME - db_user_aes128=$(source ${LSF_SUITE_TOP}/ext/profile.platform; ${LSF_SUITE_PERF_BIN}/encryptTool.sh "${db_user}") - db_password_aes128=$(source ${LSF_SUITE_TOP}/ext/profile.platform; ${LSF_SUITE_PERF_BIN}/encryptTool.sh "${db_password}") - # Change the username password in the Datasource definition - sed -i "s|UserName=\".*\"|UserName=\"${db_user_aes128}\"|" ${LSF_SUITE_PERF_CONF}/datasource.xml - sed -i "s|Password=\".*\"|Password=\"${db_password_aes128}\"|" ${LSF_SUITE_PERF_CONF}/datasource.xml -} - -########### LSFSETUP-BEGIN ################################################################ -###################################### search LSFSETUP-END to skip this part ############## - -# Setup LSF - -if [ "$on_primary" == "true" ]; then - - echo "LSF configuration begin" - - mkdir -p $LSF_RC_IBMCLOUDHPC_CONF - chown -R lsfadmin:root $LSF_RC_IBMCLOUDHPC_CONF - - echo "Setting up LSF" - - # 0. Update LSF configuration with new cluster name if cluster_name is not default - if [ "$default_cluster_name" != "$cluster_name" ]; then - echo "New cluster name $cluster_name has been identified. Upgrading the cluster configurations accordingly." - grep -rli "$default_cluster_name" $LSF_CONF/* | xargs sed -i "s/$default_cluster_name/$cluster_name/g" - # Below directory in work has cluster_name twice in path and was resulting in a indefinite loop scenario. So, this directory has to be handled separately - mv $LSF_TOP/work/$default_cluster_name/live_confdir/lsbatch/$default_cluster_name $LSF_TOP/work/"$cluster_name"/live_confdir/lsbatch/"$cluster_name" - for file in $(find $LSF_TOP -name "*$default_cluster_name*"); do mv "$file" $(echo "$file"| sed -r "s/$default_cluster_name/$cluster_name/g"); done - fi - -if [ "$solution" = "lsf" ]; then - LSB_RC_EXTERNAL_HOST_FLAG="icgen2host" -elif [ "$solution" = "hpc" ]; then - LSB_RC_EXTERNAL_HOST_FLAG="icgen2host cloudhpchost" -fi - - # 1. setting up lsf configuration - cat <> $LSF_CONF_FILE -LSB_RC_EXTERNAL_HOST_IDLE_TIME=10 -LSF_DYNAMIC_HOST_TIMEOUT="EXPIRY[10m] THRESHOLD[250] INTERVAL[60m]" -LSB_RC_EXTERNAL_HOST_FLAG="$LSB_RC_EXTERNAL_HOST_FLAG" -LSB_RC_UPDATE_INTERVAL=15 -LSB_RC_MAX_NEWDEMAND=50 -LSF_UDP_TO_TCP_THRESHOLD=9000 -LSF_CALL_LIM_WITH_TCP=N -LSF_ANNOUNCE_MASTER_TCP_WAITTIME=600 -LSF_CLOUD_UI=Y -LSF_RSH="ssh -o 'PasswordAuthentication no' -o 'StrictHostKeyChecking no'" -EOT - sed -i "s/LSF_MASTER_LIST=.*/LSF_MASTER_LIST=\"${mgmt_hostnames}\"/g" $LSF_CONF_FILE - - if [ "$hyperthreading" == true ]; then - ego_define_ncpus="threads" - else - ego_define_ncpus="cores" - - cat << 'EOT' > /root/lsf_hyperthreading -#!/bin/sh -for vcpu in $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | cut -s -d- -f2 | cut -d- -f2 | uniq); do - echo "0" > "/sys/devices/system/cpu/cpu"$vcpu"/online" -done -EOT - chmod 755 /root/lsf_hyperthreading - command="/root/lsf_hyperthreading" - sh $command && (crontab -l 2>/dev/null; echo "@reboot $command") | crontab - - fi - echo "EGO_DEFINE_NCPUS=${ego_define_ncpus}" >> $LSF_CONF_FILE - - # 2. setting up lsf.shared - sed -i "s/^# icgen2host/ icgen2host/g" $LSF_CONF/lsf.shared - sed -i '/^End Resource/i cloudhpchost Boolean () () (hpc hosts from IBM Cloud HPC pool)' $LSF_CONF/lsf.shared - sed -i '/^End Resource/i family String () () (account name for the external hosts)' $LSF_CONF/lsf.shared - - # 3. setting up lsb.module - sed -i "s/^#schmod_demand/schmod_demand/g" "$LSF_LSBATCH_CONF/lsb.modules" - - # 4. 
setting up lsb.queue - sed -i '/^Begin Queue$/,/^End Queue$/{/QUEUE_NAME/{N;s/\(QUEUE_NAME\s*=[^\n]*\)\n/\1\nRC_HOSTS = all\n/}}' "$LSF_LSBATCH_CONF/lsb.queues" - cat <> "$LSF_LSBATCH_CONF/lsb.queues" -Begin Queue -QUEUE_NAME=das_q -DATA_TRANSFER=Y -RC_HOSTS=all -HOSTS=all -RES_REQ=type==any -End Queue -EOT - - # 5. setting up lsb.hosts - for hostname in $mgmt_hostnames; do - sed -i "/^default !.*/a $hostname 0 () () () () () (Y)" "$LSF_LSBATCH_CONF/lsb.hosts" - done - - # 6. setting up lsf.cluster."$cluster_name" - sed -i "s/^lsfservers/#lsfservers/g" "$LSF_CONF/lsf.cluster.$cluster_name" - sed -i 's/LSF_HOST_ADDR_RANGE=\*.\*.\*.\*/LSF_HOST_ADDR_RANGE=10.*.*.*/' "$LSF_CONF/lsf.cluster.$cluster_name" - for hostname in $mgmt_hostnames; do - sed -i "/^#lsfservers.*/a $hostname ! ! 1 (mg)" "$LSF_CONF/lsf.cluster.$cluster_name" - done - - # Updating the value of login node as Intel for lsfserver to update cluster file name - sed -i "/^#lsfservers.*/a $login_hostname Intel_E5 X86_64 0 ()" "$LSF_CONF/lsf.cluster.$cluster_name" - echo "LSF_SERVER_HOSTS=\"$mgmt_hostnames\"" >> $LSF_CONF_FILE - - # Update ego.conf - sed -i "s/EGO_MASTER_LIST=.*/EGO_MASTER_LIST=\"${mgmt_hostnames}\"/g" "$LSF_EGO_CONF_FILE" - # 0.5 Update lsfservers with newly added lsf management nodes - grep -rli 'lsfservers' $LSF_CONF/*|xargs sed -i "s/lsfservers/${this_hostname}/g" - - # Setup LSF resource connector - echo "Setting up LSF resource connector" - - # 1. Create hostProviders.json - if [ "$solution" = "hpc" ] ; then - cat < "$LSF_RC_CONF"/hostProviders.json -{ - "providers":[ - { - "name": "ibmcloudhpc", - "type": "ibmcloudhpcProv", - "confPath": "resource_connector/ibmcloudhpc", - "scriptPath": "resource_connector/ibmcloudhpc" - } - ] -} -EOT - else - cat < "$LSF_RC_CONF"/hostProviders.json -{ - "providers":[ - { - "name": "ibmcloudgen2", - "type": "ibmcloudgen2Prov", - "confPath": "resource_connector/ibmcloudgen2", - "scriptPath": "resource_connector/ibmcloudgen2" - } - ] -} -EOT - fi - - # 2. Create ibmcloudgen2_config.json - cat < "$LSF_RC_IC_CONF"/ibmcloudgen2_config.json -{ - "IBMCLOUDGEN2_KEY_FILE": "${LSF_RC_IC_CONF}/credentials", - "IBMCLOUDGEN2_PROVISION_FILE": "${LSF_RC_IC_CONF}/user_data.sh", - "IBMCLOUDGEN2_MACHINE_PREFIX": "${cluster_prefix}", - "LogLevel": "INFO", - "ApiEndPoints": { - "eu-gb": "https://eu-gb.iaas.cloud.ibm.com/v1", - "au-syd": "https://au-syd.iaas.cloud.ibm.com/v1", - "ca-tor": "https://ca-tor.iaas.cloud.ibm.com/v1", - "jp-osa": "https://jp-osa.iaas.cloud.ibm.com/v1", - "jp-tok": "https://jp-tok.iaas.cloud.ibm.com/v1", - "br-sao": "https://br-sao.iaas.cloud.ibm.com/v1", - "us-south": "https://us-south.iaas.cloud.ibm.com/v1", - "eu-de": "https://eu-de.iaas.cloud.ibm.com/v1", - "us-east": "https://us-east.iaas.cloud.ibm.com/v1" - } -} -EOT - - # 3. Create ibmcloudhpc_config.json - cat < "$LSF_RC_IBMCLOUDHPC_CONF"/ibmcloudhpc_config.json -{ - "IBMCLOUDHPC_KEY_FILE": "${LSF_RC_IBMCLOUDHPC_CONF}/credentials", - "IBMCLOUDHPC_PROVISION_FILE": "${LSF_RC_IBMCLOUDHPC_CONF}/user_data.sh", - "IBMCLOUDHPC_MACHINE_PREFIX": "${cluster_prefix}", - "LogLevel": "INFO", - "CONTRACT_ID": "${contract_id}", - "CLUSTER_ID": "${cluster_name}", - "PROJECT_ID": "${ce_project_guid}", - "ApiEndPoints": { - "us-east": "${api_endpoint_us_east}", - "eu-de": "${api_endpoint_eu_de}", - "us-south": "${api_endpoint_us_south}" - } -} -EOT - - # 4. 
Create credentials for ibmcloudgen2 - cat < "$LSF_RC_IC_CONF"/credentials -# BEGIN ANSIBLE MANAGED BLOCK -VPC_URL=http://vpc.cloud.ibm.com/v1 -VPC_AUTH_TYPE=iam -VPC_APIKEY=$VPC_APIKEY_VALUE -RESOURCE_RECORDS_URL=https://api.dns-svcs.cloud.ibm.com/v1 -RESOURCE_RECORDS_AUTH_TYPE=iam -RESOURCE_RECORDS_APIKEY=$VPC_APIKEY_VALUE -EOT - - # 5. Create credentials for ibmcloudhpc - cat < "$LSF_RC_IBMCLOUDHPC_CONF"/credentials -# BEGIN ANSIBLE MANAGED BLOCK -CLOUD_HPC_URL=http://vpc.cloud.ibm.com/v1 -CLOUD_HPC_AUTH_TYPE=iam -CLOUD_HPC_AUTH_URL=https://iam.cloud.ibm.com -CLOUD_HPC_APIKEY=$VPC_APIKEY_VALUE -RESOURCE_RECORDS_URL=https://api.dns-svcs.cloud.ibm.com/v1 -RESOURCE_RECORDS_AUTH_TYPE=iam -RESOURCE_RECORDS_APIKEY=$VPC_APIKEY_VALUE -# END ANSIBLE MANAGED BLOCK -EOT - -cat < "$LSF_RC_IC_CONF"/ibmcloudgen2_templates.json -{ - "templates": [ - { - "templateId": "Template-${cluster_prefix}-1", - "maxNumber": "$rc_max_num", - "attributes": { - "type": ["String", "X86_64"], - "ncores": ["Numeric", "${rc_ncores}"], - "ncpus": ["Numeric", "${rc_ncpus}"], - "mem": ["Numeric", "${rc_memInMB}"], - "icgen2host": ["Boolean", "1"] - }, - "crn": "${bootdrive_crn}", - "imageId": "$imageID", - "subnetId": "$subnetId", - "vpcId": "$vpcID", - "vmType": "${rc_profile}", - "securityGroupIds": ["${securityGroupID}"], - "resourceGroupId": "$rc_rg", - "sshkey_id": "$sshkey_ID", - "region": "$regionName", - "zone": "$zone" - } - ] -} -EOT - - -#cat < "$LSF_RC_IC_CONF"/ibmcloudgen2_templates.json -#{ -# templates = [ -# for worker in var.worker_node_instance_type : { -# templateId = "Template-${var.cluster_prefix}-${worker.instance_type}" -# maxNumber = var.rc_max_num -# attributes = { -# type = ["String", "X86_64"], -# ncores = ["Numeric", worker.count / 2], -# ncpus = ["Numeric", var.hyperthreading ? worker.count : worker.count / 2], -# mem = ["Numeric", floor((var.hyperthreading ? worker.count : worker.count / 2) * 16 * 1024 * 0.9)], -# icgen2host = ["Boolean", true] -# }, -# "crn": "${bootdrive_crn}", -# "imageId": "$imageID", -# "subnetId": "$subnetId", -# "vpcId": "$vpcID" -# "vmType": "${rc_profile}" -# "securityGroupIds": ["${securityGroupID}"], -# "resourceGroupId": "$rc_rg", -# "sshkey_id": "$sshkey_ID", -# "region": "$regionName", -# "zone": "$zone", -# "vmType": worker.instance_type -# } -# ] -#} -# -#EOT -# # 6. Create ibmcloudgen2_templates.json -# ibmcloudgen2_templates="$LSF_RC_IC_CONF/ibmcloudgen2_templates.json" -# # Incrementally build a json string -# json_string="" -# -# tab="$(cat < "$ibmcloudgen2_templates" -# echo "JSON templates are created and updated on ibmcloudgen2_templates.json" - -# 7. 
Create resource template for ibmcloudhpc templates -# Define the output JSON file path - -ibmcloudhpc_templates="$LSF_RC_IBMCLOUDHPC_CONF/ibmcloudhpc_templates.json" - -# Initialize an empty JSON string -json_string="" - -# Loop through the specified regions -for region in "eu-de" "us-east" "us-south"; do - if [ "$region" = "$regionName" ]; then - # Loop through the core counts - for i in 2 4 8 16 32 48 64 96 128 176; do - if [ "$i" -gt 128 ] && [ "$region" != "us-south" ]; then - # Skip creating templates with more than 128 cores for non us-south regions - continue - fi - - ncores=$((i / 2)) - if [ "$region" = "eu-de" ] || [ "$region" = "us-east" ]; then - family="mx2" - maxmem_mx2=$((ncores * 16 * 1024)) - mem_mx2=$((maxmem_mx2 * 9 / 10)) - elif [ "$region" = "us-south" ]; then - family="mx2,mx3d" # Include both "mx2" and "mx3d" families - maxmem_mx2=$((ncores * 16 * 1024)) - mem_mx2=$((maxmem_mx2 * 9 / 10)) - maxmem_mx3d=$((ncores * 20 * 1024)) - mem_mx3d=$((maxmem_mx3d * 9 / 10)) - fi - - vpcus=$i - - if $hyperthreading; then - ncpus=$vpcus - else - ncpus=$ncores - fi - - if [ "${imageID:0:4}" == "crn:" ]; then - imagetype="imageCrn" - else - imagetype="imageId" - fi - - # Split the family string into an array and iterate over it - IFS=',' read -ra families <<< "$family" - for fam in "${families[@]}"; do - # Check if the core count is valid for the family - if [ "$fam" = "mx2" ] && [ "$i" -gt 128 ]; then - continue - fi - - templateId="Template-${cluster_prefix}-$((1000+i))-$fam" # Add family to templateId - if [ "$fam" = "mx2" ]; then - maxmem_val="$maxmem_mx2" # Use mx2 specific maxmem value - mem_val="$mem_mx2" # Use mx2 specific mem value - priority=10 # Priority for mx2 - elif [ "$fam" = "mx3d" ]; then - maxmem_val="$maxmem_mx3d" # Use mx3d specific maxmem value - mem_val="$mem_mx3d" # Use mx3d specific mem value - priority=20 # Priority for mx3d in us-south - fi - - # Construct JSON object and append it to the JSON string - json_string+=$(cat < "$ibmcloudhpc_templates" -echo "JSON templates are created and updated in ibmcloudhpc_templates.json" - -# 8. 
Define the directory to store fleet configuration files -fleet_config_dir="$LSF_RC_IBMCLOUDHPC_CONF" -# Loop through regions -for region in "eu-de" "us-east" "us-south"; do - # Define the fleet configuration family based on the region - if [ "$regionName" = "us-south" ]; then - families=("mx2" "mx3d") - else - families=("mx2") - fi - - # Loop through families - for family in "${families[@]}"; do - # Create fleet configuration file for the region and family - cat < "${fleet_config_dir}/ibmcloudhpc_fleetconfig_${family}.json" -{ - "fleet_request": { - "availability_policy": { - "host_failure": "restart" - }, - "host_name": { - "prefix": "${cluster_prefix}", - "domain": "${dns_domain}" - }, - "instance_selection": { - "type": "automatic", - "optimization": "minimum_price" - }, - "boot_volume_attachment": { - "encryption_key": { - "crn": "${bootdrive_crn}" - } - }, - "zones": [ - { - "name": "${zoneName}", - "primary_network_interface": { - "name": "eth0", - "subnet": { - "crn": "${subnetID}" - }, - "security_groups": [ - { - "id": "${securityGroupID}" - } - ] - } - } - ], - "profile_requirement": { - "families": [ - { - "name": "${family}", - "rank": 1, - "profiles": [] - } - ] - } - } -} -EOT - done -done - -# Set permissions for fleet configuration files -chown lsfadmin:root "${fleet_config_dir}/ibmcloudhpc_fleetconfig_"* -chmod 644 "${fleet_config_dir}/ibmcloudhpc_fleetconfig_"* -echo "Fleet configuration files created and updated." - - # 9. create user_data.json for compute nodes - ( - cat < "$LSF_RC_IBMCLOUDHPC_CONF"/user_data.sh - - # TODO: Setting up License Scheduler configurations - # No changes has been advised to be automated - - # 10. Copy user_data.sh from ibmcloudhpc to ibmcloudgen2 - cp $LSF_RC_IBMCLOUDHPC_CONF/user_data.sh $LSF_RC_IC_CONF/user_data.sh - - # Setting up Data Manager configurations - mkdir -p "${LSF_DM_STAGING_AREA}" - chown -R lsfadmin:root "${LSF_DM_STAGING_AREA}" - cat <> "${LSF_CONF}"/lsf.datamanager."${cluster_name}" -Begin Parameters -ADMINS = lsfadmin -STAGING_AREA = "${LSF_DM_STAGING_AREA}" -End Parameters -EOT - - # Uncomment the below line to enable Datamanager - cat <> $LSF_CONF_FILE -#LSF_DATA_HOSTS=${this_hostname} -# LSF_DATA_PORT=1729 -EOT - - echo "LSF configuration end" - ##### LSFSETUP-END ##### - - # Finally ensure ownership for conf files - chown -R lsfadmin:root $LSF_RC_IBMCLOUDHPC_CONF - -else - - # nothing to do on candidate nodes - echo "LSF configuration not to be done on secondary nodes, skipping" - -fi - -########### LSFSETUP-END ################################################################## -########################################################################################### - -echo "Initiating LSF share mount" - -# Function to attempt NFS mount with retries -mount_nfs_with_retries() { - local server_path=$1 - local client_path=$2 - local retries=5 - local success=false - - rm -rf "${client_path}" - mkdir -p "${client_path}" - - for (( j=0; j> $logfile - if mount | grep -q "${client_path}"; then - echo "Mount successful for ${server_path} on ${client_path}" - success=true - break - else - echo "Attempt $((j+1)) of $retries failed for ${server_path} on ${client_path}" - sleep 2 - fi - done - - if [ "$success" = true ]; then - chmod 777 "${client_path}" - echo "${server_path} ${client_path} nfs rw,sec=sys,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,_netdev 0 0" >> /etc/fstab - else - echo "Mount not found for ${server_path} on ${client_path} after $retries attempts." 
- rm -rf "${client_path}" - fi - - # Convert success to numeric for return - if [ "$success" = true ]; then - return 0 - else - return 1 - fi -} - -# Setup LSF share -if [ -n "${nfs_server_with_mount_path}" ]; then - echo "File share ${nfs_server_with_mount_path} found" - nfs_client_mount_path="/mnt/lsf" - nfs_client_mount_pac_path="${nfs_client_mount_path}/pac" - if mount_nfs_with_retries "${nfs_server_with_mount_path}" "${nfs_client_mount_path}"; then - # Move stuff to shared fs - for dir in conf work das_staging_area; do - if [ "$on_primary" == "true" ]; then - rm -rf "${nfs_client_mount_path}/$dir" # avoid old data already in shared fs - mv "${LSF_TOP}/$dir" "${nfs_client_mount_path}" # this local data goes to shared fs - else - rm -rf "${LSF_TOP}/$dir" # this local data can go away - fi - ln -fs "${nfs_client_mount_path}/$dir" "${LSF_TOP}" # locally link to shared fs - chown -R lsfadmin:root "${LSF_TOP}" - done - - # Check if LDAP is enabled and the existing LDAP server certificate is provided - if [ "$on_primary" == "true" ] && [ "$enable_ldap" == "true" ] && [ "$ldap_server_cert" != "null" ]; then - mkdir -p /mnt/lsf/openldap - echo "$ldap_server_cert" > /mnt/lsf/openldap/ldap_cacert.pem - chmod 755 /mnt/lsf/openldap/ldap_cacert.pem - cp -pr /mnt/lsf/openldap/ldap_cacert.pem /etc/openldap/certs/ldap_cacert.pem - echo "Configuring with the existing LDAP server. Existing LDAP server certificate found. Proceeding with the setup!" - fi - - # Sharing the lsfsuite.conf folder - if [ "$on_primary" == "true" ] && [ "$enable_app_center" == "true" ] && [ "$app_center_high_availability" == "true" ]; then - # Create pac folder if it does not exist - [ ! -d "${nfs_client_mount_pac_path}" ] && mkdir -p "${nfs_client_mount_pac_path}" - - # Remove the original folder and create symlink for gui-conf - [ -d "${nfs_client_mount_pac_path}/gui-conf" ] && rm -rf "${nfs_client_mount_pac_path}/gui-conf" - mv "${LSF_SUITE_GUI_CONF}" "${nfs_client_mount_pac_path}/gui-conf" - chown -R lsfadmin:root "${nfs_client_mount_pac_path}/gui-conf" && chown -R lsfadmin:lsfadmin "${LSF_SUITE_GUI_CONF}" - ln -fs "${nfs_client_mount_pac_path}/gui-conf" "${LSF_SUITE_GUI_CONF}" - - # Remove the original folder and create symlink for gui-work - [ -d "${nfs_client_mount_pac_path}/gui-work" ] && rm -rf "${nfs_client_mount_pac_path}/gui-work" - mv "${LSF_SUITE_GUI_WORK}" "${nfs_client_mount_pac_path}/gui-work" - chown -R lsfadmin:root "${nfs_client_mount_pac_path}/gui-work" && chown -R lsfadmin:lsfadmin "${LSF_SUITE_GUI_WORK}" - ln -fs "${nfs_client_mount_pac_path}/gui-work" "${LSF_SUITE_GUI_WORK}" - fi - - # Create a data directory for sharing HPC workload data - if [ "$on_primary" == "true" ]; then - mkdir -p "${nfs_client_mount_path}/data" - ln -s "${nfs_client_mount_path}/data" "$LSF_TOP/work/data" - chown -R lsfadmin:root "$LSF_TOP/work/data" - fi - - # VNC Sessions - if [ "$on_primary" == "true" ]; then - mkdir -p "${nfs_client_mount_path}/repository-path" - # With this change, LDAP User can able to submit the job from App Center UI. 
- chmod -R 777 "${nfs_client_mount_path}/repository-path" - chown -R lsfadmin:root "${nfs_client_mount_path}/repository-path" - fi - - # Create folder in shared file system to store logs - mkdir -p "${nfs_client_mount_path}/log/${HOSTNAME}" - chown -R lsfadmin:root "${nfs_client_mount_path}/log" - if [ "$(ls -A ${LSF_TOP}/log)" ]; then - # Move all existing logs to the new folder - mv ${LSF_TOP}/log/* "${nfs_client_mount_path}/log/${HOSTNAME}" - fi - # Remove the original folder and create symlink so the user can still access to default location - rm -rf "${LSF_TOP}/log" - ln -fs "${nfs_client_mount_path}/log/${HOSTNAME}" "${LSF_TOP}/log" - chown -R lsfadmin:root "${LSF_TOP}/log" - - # Create log folder for pac and set proper owner - mkdir -p "${nfs_client_mount_path}/gui-logs" - chown -R lsfadmin:root "${nfs_client_mount_path}/gui-logs" - # Move PAC logs to shared folder - mkdir -p "${nfs_client_mount_path}/gui-logs/${HOSTNAME}" - if [ -d "${LSF_SUITE_GUI}/logs/${HOSTNAME}" ] && [ "$(ls -A ${LSF_SUITE_GUI}/logs/${HOSTNAME})" ]; then - mv "${LSF_SUITE_GUI}/logs/${HOSTNAME}" "${nfs_client_mount_path}/gui-logs/${HOSTNAME}" - fi - chown -R lsfadmin:root "${nfs_client_mount_path}/gui-logs/${HOSTNAME}" - ln -fs "${nfs_client_mount_path}/gui-logs/${HOSTNAME}" "${LSF_SUITE_GUI}/logs/${HOSTNAME}" - chown -R lsfadmin:root "${LSF_SUITE_GUI}/logs/${HOSTNAME}" - fi -else - echo "Mount not found for ${nfs_server_with_mount_path}, Exiting !!" - exit 1 -fi -echo "Setting LSF share is completed." - -# Setup Custom file shares -echo "Setting custom file shares." -if [ -n "${custom_file_shares}" ]; then - echo "Custom file share ${custom_file_shares} found" - file_share_array=(${custom_file_shares}) - mount_path_array=(${custom_mount_paths}) - length=${#file_share_array[@]} - - for (( i=0; i> "$LSF_HOSTS_FILE" -else - while [ ! -f "$LSF_HOSTS_FILE" ]; do - echo "Waiting for cluster configuration created by management node to be shared." - sleep 5s - done -fi - -# Update the entry to LSF_HOSTS_FILE -if [ "$on_primary" == "true" ]; then - echo "$login_ip $login_hostname" >> $LSF_HOSTS_FILE - for hostname in $mgmt_hostnames; do - # we map hostnames to ips with DNS, even if we have the ips list already - while true; do - echo "querying DNS: $hostname" - ip="$(dig +short "$hostname.${dns_domain}")" - if [ "$ip" != "" ]; then - sed -i "s/^$ip .*/$ip $hostname/g" $LSF_HOSTS_FILE - break - fi - sleep 2 - done - echo "$hostname $ip added to LSF host file" - done -fi - -for hostname in $mgmt_hostnames; do - while ! 
grep "$hostname" "$LSF_HOSTS_FILE"; do - echo "Waiting for $hostname to be added to LSF host file" - sleep 5 - done - echo "$hostname found in LSF host file" -done -cat $LSF_HOSTS_FILE >> /etc/hosts - -if [ "$enable_app_center" = true ] && [ "${app_center_high_availability}" = true ]; then - # Add entry for VNC scenario - echo "127.0.0.1 pac pac.$dns_domain" >> /etc/hosts -fi - -# Create lsf.sudoers file to support single lsfstartup and lsfrestart command from management node -cat < "/etc/lsf.sudoers" -LSF_STARTUP_USERS="lsfadmin" -LSF_STARTUP_PATH=$LSF_TOP_VERSION/linux3.10-glibc2.17-x86_64/etc/ -EOT -chmod 600 /etc/lsf.sudoers -ls -l /etc/lsf.sudoers - -$LSF_TOP_VERSION/install/hostsetup --top="$LSF_TOP" --setuid -echo "Added LSF administrators to start LSF daemons" - -lsfadmin_home_dir="/home/lsfadmin" -echo "source ${LSF_CONF}/profile.lsf" >> /root/.bashrc -echo "source ${LSF_CONF}/profile.lsf" >> "${lsfadmin_home_dir}"/.bashrc - -if [ "$on_primary" == "true" ]; then - # Configure and start perfmon, used for lsf prometheus monitoring - sed -i '/^End Parameters/i SCHED_METRIC_ENABLE=Y' $LSF_CONF/lsbatch/"$cluster_name"/configdir/lsb.params -fi - -echo 'Ready to start daemons' - -# only start after the primary node gives a green-light -if [ "$on_primary" == "true" ]; then - touch /mnt/lsf/config_done -fi -while true; do - [ -f /mnt/lsf/config_done ] && break - echo "waiting, not starting yet" - sleep 3 - ls -l /mnt/lsf /mnt/lsf/config_done 1>/dev/null 2>&1 # creating some NFS activity -done -echo "got green light for starting" - -$LSF_TOP_VERSION/install/hostsetup --top="$LSF_TOP" --boot="y" --start="y" -systemctl status lsfd - -### warning: this dangerously unsets LSF_TOP and LSF_VERSION -source ~/.bashrc - -# Set `do_app_center` based on conditions -do_app_center=false -if [ "$enable_app_center" = true ]; then - if [ "$on_primary" == "true" ] || [ "${app_center_high_availability}" = true ]; then - do_app_center=true - fi -fi - -# Main Application Center configuration block for HPC solution -if [ "$do_app_center" = true ] && [ "$solution" = "hpc" ]; then - if rpm -q lsf-appcenter; then - echo "Application center packages are found..." 
- echo "${app_center_gui_pwd}" | passwd --stdin lsfadmin - sed -i '$i\\ALLOW_EVENT_TYPE=JOB_NEW JOB_STATUS JOB_FINISH2 JOB_START JOB_EXECUTE JOB_EXT_MSG JOB_SIGNAL JOB_REQUEUE JOB_MODIFY2 JOB_SWITCH METRIC_LOG' $LSF_CONF/lsbatch/"$cluster_name"/configdir/lsb.params - sed -i 's/NEWJOB_REFRESH=y/NEWJOB_REFRESH=Y/g' $LSF_CONF/lsbatch/"$cluster_name"/configdir/lsb.params - - if [ "${app_center_high_availability}" = true ]; then - create_certificate - configure_icd_datasource - fi - - if [ "$on_primary" == "true" ]; then - # Update the Job directory, needed for VNC Sessions - sed -i 's|/home|/mnt/lsf/repository-path|' "$LSF_SUITE_GUI_CONF/Repository.xml" - if [ "${app_center_high_availability}" = true ]; then - echo "LSF_ADDON_HOSTS=\"${mgmt_hostnames}\"" >> $LSF_CONF/lsf.conf - create_appcenter_database - sed -i "s/NoVNCProxyHost=.*/NoVNCProxyHost=pac.${dns_domain}/g" "$LSF_SUITE_GUI_CONF/pmc.conf" - sed -i "s|.*|${mgmt_hostname_primary}|" $LSF_SUITE_GUI_CONF/pnc-config.xml - sed -i "s|.*|pac.${dns_domain}|" $LSF_SUITE_GUI_CONF/pnc-config.xml - else - echo "LSF_ADDON_HOSTS=$HOSTNAME" >> $LSF_CONF/lsf.conf - sed -i 's/NoVNCProxyHost=.*/NoVNCProxyHost=localhost/g' "$LSF_SUITE_GUI_CONF/pmc.conf" - sed -i "s|.*|${mgmt_hostname_primary}|" $LSF_SUITE_GUI_CONF/pnc-config.xml - sed -i "s|.*|localhost|" $LSF_SUITE_GUI_CONF/pnc-config.xml - fi - fi - - echo "source $LSF_SUITE_TOP/ext/profile.platform" >> ~/.bashrc - echo "source $LSF_SUITE_TOP/ext/profile.platform" >> "${lsfadmin_home_dir}"/.bashrc - rm -rf $LSF_SUITE_GUI/3.0/bin/novnc.pem - fi -elif [ "$do_app_center" = true ] && [ "$solution" = "lsf" ]; then - # Alternative configuration block for LSF BYOL scenario - echo "Configuring the App Center for LSF BYOL" - if (( $(ls -ltr /opt/IBM/lsf_app_center_cloud_packages/ | grep "pac" | wc -l) > 0 )); then - echo "Application Center package found!" - LSF_ENVDIR="/opt/ibm/lsf/conf" - echo $LSF_ENVDIR - echo ${app_center_gui_pwd} | sudo passwd --stdin lsfadmin - sed -i '$i\\ALLOW_EVENT_TYPE=JOB_NEW JOB_STATUS JOB_FINISH2 JOB_START JOB_EXECUTE JOB_EXT_MSG JOB_SIGNAL JOB_REQUEUE JOB_MODIFY2 JOB_SWITCH METRIC_LOG' $LSF_CONF/lsbatch/"$cluster_name"/configdir/lsb.params - sed -i 's/NEWJOB_REFRESH=y/NEWJOB_REFRESH=Y/g' $LSF_CONF/lsbatch/"$cluster_name"/configdir/lsb.params - sed -i 's/LSF_DISABLE_LSRUN=Y/LSF_DISABLE_LSRUN=N/g' $LSF_CONF/lsf.conf - echo "LSF_ADDON_HOSTS=\"${mgmt_hostnames}\"" >> $LSF_CONF/lsf.conf - - # Additional configurations for BYOL - sudo systemctl status mariadb -l - - if [ "${app_center_high_availability}" = true ]; then - if [ "$on_primary" == "true" ]; then - sudo mkdir -p /mnt/lsf/lsf_packages - chmod 755 /mnt/lsf/lsf_packages - cp /opt/IBM/lsf_app_center_cloud_packages/pac10.2.0.14_standard_linux-x64.tar.Z /mnt/lsf/lsf_packages - fi - # If we're on a secondary node, copy the package from /mnt/lsf/lsf_packages - if [ "$on_primary" != "true" ]; then - cp /mnt/lsf/lsf_packages/pac10.2.0.14_standard_linux-x64.tar.Z /opt/IBM/lsf_app_center_cloud_packages - fi - fi - - cd /opt/IBM/lsf_app_center_cloud_packages - tar -xvf pac10.2.0.14_standard_linux-x64.tar.Z - cd pac10.2.0.14_standard_linux-x64 - sed -i '1i export SHARED_CONFIGURATION_DIR="/mnt/lsf/pac"' pacinstall.sh - sed -i 's/#\ \.\ $LSF_ENVDIR\/profile\.lsf/. 
\/opt\/ibm\/lsf\/conf\/profile\.lsf/g' pacinstall.sh - sed -i 's/# export PAC_ADMINS=\"user1 user2\"/export PAC_ADMINS=\"lsfadmin\"/g' pacinstall.sh - - mkdir -p $LSF_CONF/work/"$cluster_name"/logdir/stream - touch $LSF_CONF/work/"$cluster_name"/logdir/stream/lsb.stream - - ./pacinstall.sh -s -y >> $logfile - echo "Sleeping for 10 seconds..." - sleep 10 - - until rpm -qa | grep lsf-appcenter; do - sleep 10 # Check every 10 seconds - done - echo "lsf-appcenter RPM is available, proceeding with configurations..." - - if [ "${app_center_high_availability}" = true ]; then - create_certificate - configure_icd_datasource - fi - - if [ "$on_primary" == "true" ]; then - # Update the Job directory, needed for VNC Sessions - sed -i 's|/home|/mnt/lsf/repository-path|' "$LSF_SUITE_GUI_CONF/Repository.xml" - if [ "${app_center_high_availability}" = true ]; then - echo "LSF_ADDON_HOSTS=\"${mgmt_hostnames}\"" >> $LSF_CONF/lsf.conf - create_appcenter_database - sed -i "s/NoVNCProxyHost=.*/NoVNCProxyHost=pac.${dns_domain}/g" "$LSF_SUITE_GUI_CONF/pmc.conf" - sed -i "s|.*|${mgmt_hostname_primary}|" $LSF_SUITE_GUI_CONF/pnc-config.xml - sed -i "s|.*|pac.${dns_domain}|" $LSF_SUITE_GUI_CONF/pnc-config.xml - else - #echo "LSF_ADDON_HOSTS=$HOSTNAME" >> $LSF_CONF/lsf.conf - sed -i 's/NoVNCProxyHost=.*/NoVNCProxyHost=localhost/g' "$LSF_SUITE_GUI_CONF/pmc.conf" - sed -i "s|.*|${mgmt_hostname_primary}|" $LSF_SUITE_GUI_CONF/pnc-config.xml - sed -i "s|.*|localhost|" $LSF_SUITE_GUI_CONF/pnc-config.xml - fi - fi - - echo "source $LSF_SUITE_TOP/ext/profile.platform" >> ~/.bashrc - echo "source $LSF_SUITE_TOP/ext/profile.platform" >> "${lsfadmin_home_dir}"/.bashrc - rm -rf $LSF_SUITE_GUI/3.0/bin/novnc.pem - source ~/.bashrc - - perfadmin start all; sleep 5; perfadmin list - sleep 10 - pmcadmin start; pmcadmin list - - appcenter_status=$(pmcadmin list | grep "WEBGUI" | awk '{print $2}') - if [ "$appcenter_status" = "STARTED" ]; then - echo "Application Center installation completed..." - else - echo "Application Center installation failed..." - fi - fi -fi - - - -if [ "$do_app_center" = true ] && [ "${solution}" = "hpc" ]; then - # Start all the PerfMonitor and WEBUI processes. - source ~/.bashrc - nohup >/tmp/perfout setsid perfadmin start all; perfadmin list - sleep 5 - nohup >/tmp/pmcout setsid pmcadmin start; pmcadmin list - appcenter_status=$(pmcadmin list | grep "WEBGUI" | awk '{print $2}') - if [ "$appcenter_status" = "STARTED" ]; then - echo "Application Center installation completed..." - else - echo "Application Center installation failed..." - fi -fi - - -# Setup start at boot -# Lsf processes are started by systemctl. -# The script '/root/lsf_start_pac' manages the start of PAC processes if in HA. -if [ "$do_app_center" = "true" ]; then - echo "Configuring the start of the Pac" - cat < /root/lsf_start_pac -#!/bin/sh - -logfile=/tmp/lsf_start_pac.log -echo "\$(date +'%Y%m%d_%H%M%S'): START" > \$logfile - -# Wait mount point just to be sure it is ready -while [ ! mountpoint /mnt/lsf ]; do - sleep 1; -done -echo "\$(date +'%Y%m%d_%H%M%S'): File system '/mnt/lsf' is mounted" >> \$logfile - -# Waiting lsf processes before starting PAC -source ~/.bashrc -RC=1 -x=1 -while [ \$RC -eq 1 ] && [ \$x -le 600 ]; do - lsf_daemons status >> \$logfile; RC=\$? 
- echo "\$(date +'%Y%m%d_%H%M%S'): RC=\$RC; attempt #\$x" >> \$logfile - x=\$((x+1)) - sleep \$((\$x / 10 + 1)) -done -echo "END" >> \$logfile -perfadmin start all >> \$logfile -sleep 5 -pmcadmin start >> \$logfile -echo "EXIT" >> \$logfile - -EOT - chmod 755 /root/lsf_start_pac - command="/root/lsf_start_pac" - (crontab -l 2>/dev/null; echo "@reboot $command") | crontab - -fi - -# Check if LDAP configuration is enabled -if [ "$enable_ldap" = "true" ]; then - - # Extract and store the major version of the operating system (8 or 9) - version=$(grep -oE 'release [0-9]+' /etc/redhat-release | awk '{print $2}') - - # Proceed if the detected version is either 8 or 9 - if [ "$version" == "8" ] || [ "$version" == "9" ]; then - echo "Detected as RHEL or Rocky $version. Proceeding with LDAP client configuration..." - - # Enable password authentication for SSH by modifying the configuration file - sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config - systemctl restart sshd - - # Check if the SSL certificate file exists, then copy it to the correct location - # Retry finding SSL certificate with a maximum of 5 attempts and 5 seconds sleep between retries - for attempt in {1..5}; do - if [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ]; then - echo "LDAP SSL cert found under /mnt/lsf/openldap/ldap_cacert.pem path" - mkdir -p /etc/openldap/certs - cp -pr /mnt/lsf/openldap/ldap_cacert.pem /etc/openldap/certs/ldap_cacert.pem - break - else - echo "SSL cert not found on attempt $attempt. Retrying in 5 seconds..." - sleep 5 - fi - done - # Exit if the SSL certificate is still not found after 5 attempts - [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ] || { echo "SSL cert not found after 5 attempts. Exiting."; exit 1; } - - # Create and configure the SSSD configuration file for LDAP integration - cat < /etc/sssd/sssd.conf -[sssd] -config_file_version = 2 -services = nss, pam, autofs -domains = default - -[nss] -homedir_substring = /home - -[pam] - -[domain/default] -id_provider = ldap -autofs_provider = ldap -auth_provider = ldap -chpass_provider = ldap -ldap_uri = ldap://${ldap_server_ip} -ldap_search_base = dc=${base_dn%%.*},dc=${base_dn#*.} -ldap_id_use_start_tls = True -ldap_tls_cacertdir = /etc/openldap/certs -cache_credentials = True -ldap_tls_reqcert = allow -EOF - - # Secure the SSSD configuration file by setting appropriate permissions - chmod 600 /etc/sssd/sssd.conf - chown root:root /etc/sssd/sssd.conf - - # Create and configure the OpenLDAP configuration file for TLS - cat < /etc/openldap/ldap.conf -BASE dc=${base_dn%%.*},dc=${base_dn#*.} -URI ldap://${ldap_server_ip} -TLS_CACERT /etc/openldap/certs/ldap_cacert.pem -TLS_CACERTDIR /etc/openldap/certs -EOF - - # Rehash certificates in the OpenLDAP directory to ensure proper recognition - openssl rehash /etc/openldap/certs - - # Apply the SSSD and home directory creation configuration using authselect - authselect select sssd with-mkhomedir --force - - # Enable and start the SSSD and oddjobd services for user authentication and home directory management - systemctl enable --now sssd oddjobd - - # Restart both services to apply the configuration - systemctl restart sssd oddjobd - - # Validate the LDAP configuration by performing a test search using ldapsearch - if ldapsearch -x -H ldap://"${ldap_server_ip}"/ -b "dc=${base_dn%%.*},dc=${base_dn#*.}" > /dev/null; then - echo "LDAP configuration completed successfully!" - else - echo "LDAP configuration failed! Exiting." 
- exit 1
- fi
-
- # Ensure LSF commands are available to all users by adding the profile to bashrc
- echo ". ${LSF_CONF}/profile.lsf" >> /etc/bashrc
- source /etc/bashrc
-
- else
- # Exit if an unsupported RHEL version is detected
- echo "This script is designed for RHEL 8 or 9. Detected version: $version. Exiting."
- exit 1
- fi
-fi
-
-# Manually start perfmon, used by monitoring
-# This is not needed, given that SCHED_METRIC_ENABLE=Y
-#su - lsfadmin -c "badmin perfmon start"
-
-# Ensure lsf_prometheus_exporter service to be executed after shared filesystem mount
-sed -i 's/After=network-online.target/After=network-online.target mnt-lsf.mount/g' /etc/systemd/system/lsf_prometheus_exporter.service
-systemctl daemon-reload
-
-# Enable LSF prometheus exporter
-systemctl enable lsf_prometheus_exporter
-systemctl restart lsf_prometheus_exporter
-
-# Setting up the Metrics Agent
-if [ "$observability_monitoring_enable" = true ]; then
-
- if [ "$cloud_monitoring_access_key" != "" ] && [ "$cloud_monitoring_ingestion_url" != "" ]; then
-
- SYSDIG_CONFIG_FILE="/opt/draios/etc/dragent.yaml"
- PROMETHEUS_CONFIG_FILE="/opt/prometheus/prometheus.yml"
-
- #packages installation
- echo "Writing sysdig config file"
-
- #sysdig config file
- echo "Setting customerid access key"
- sed -i "s/==ACCESSKEY==/$cloud_monitoring_access_key/g" $SYSDIG_CONFIG_FILE
- sed -i "s/==COLLECTOR==/$cloud_monitoring_ingestion_url/g" $SYSDIG_CONFIG_FILE
- echo "tags: type:management,lsf:true" >> $SYSDIG_CONFIG_FILE
-
- cat <<EOTF > $PROMETHEUS_CONFIG_FILE
-global:
-  scrape_interval: 60s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
-  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
-  # scrape_timeout is set to the global default (10s).
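# Once this prometheus.yml is fully written (the scrape job and remote_write section follow
# below), the wiring can be sanity-checked on the management node. A hedged sketch -- it
# assumes curl and promtool happen to be installed, which this script does not guarantee:
if command -v promtool >/dev/null 2>&1; then
  promtool check config /opt/prometheus/prometheus.yml   # validate the generated config
fi
curl -s http://localhost:9405/metrics | head -n 5         # lsf_prometheus_exporter endpoint scraped below
systemctl is-active prometheus lsf_prometheus_exporter    # both units should report "active"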
-
-scrape_configs:
-  - job_name: "lsf_prometheus_exporter"
-    static_configs:
-      - targets: ["localhost:9405"]
-remote_write:
-- url: "$cloud_monitoring_prws_url"
-  authorization:
-    credentials: "$cloud_monitoring_prws_key"
-EOTF
-
- # Enable prometheus
- systemctl enable prometheus
- systemctl restart prometheus
-
- echo "Restarting sysdig agent"
- systemctl enable dragent
- systemctl restart dragent
- else
- echo "Skipping metrics agent configuration due to missing parameters"
- fi
-else
- echo "Metrics agent configuration skipped since monitoring provisioning is not enabled"
-fi
-
-# Setting up the IBM Cloud Logs
-if [ "$observability_logs_enable_for_management" = true ]; then
-
- echo "Configuring cloud logs for management since observability logs for management is enabled"
- sudo cp /root/post-config.sh /opt/ibm
- cd /opt/ibm
-
- cat <<EOL > /etc/fluent-bit/fluent-bit.conf
-[SERVICE]
-  Flush 1
-  Log_Level info
-  Daemon off
-  Parsers_File parsers.conf
-  Plugins_File plugins.conf
-  HTTP_Server On
-  HTTP_Listen 0.0.0.0
-  HTTP_Port 9001
-  Health_Check On
-  HC_Errors_Count 1
-  HC_Retry_Failure_Count 1
-  HC_Period 30
-  storage.path /fluent-bit/cache
-  storage.max_chunks_up 192
-  storage.metrics On
-
-[INPUT]
-  Name syslog
-  Path /tmp/in_syslog
-  Buffer_Chunk_Size 32000
-  Buffer_Max_Size 64000
-  Receive_Buffer_Size 512000
-
-[INPUT]
-  Name tail
-  Tag *
-  Path /opt/ibm/lsf/log/*.log
-  Path_Key file
-  Exclude_Path /var/log/at/**
-  DB /opt/ibm/lsf/log/fluent-bit.DB
-  Buffer_Chunk_Size 32KB
-  Buffer_Max_Size 256KB
-  Skip_Long_Lines On
-  Refresh_Interval 10
-  storage.type filesystem
-  storage.pause_on_chunks_overlimit on
-
-[FILTER]
-  Name modify
-  Match *
-  Add subsystemName management
-  Add applicationName lsf
-
-@INCLUDE output-logs-router-agent.conf
-EOL
-
- sudo chmod +x post-config.sh
- sudo ./post-config.sh -h $cloud_logs_ingress_private_endpoint -p "3443" -t "/logs/v1/singles" -a IAMAPIKey -k $VPC_APIKEY_VALUE --send-directly-to-icl -s true -i Production
- sudo echo "2024-10-16T14:31:16+0000 INFO Testing IBM Cloud LSF Logs from management: $this_hostname" >> /opt/ibm/lsf/log/test.log
- sudo logger -u /tmp/in_syslog my_ident my_syslog_test_message_from_management:$this_hostname
-
-else
- echo "Cloud Logs configuration skipped since observability logs for management is not enabled"
-fi
-
-echo "END $(date '+%Y-%m-%d %H:%M:%S')"
-sleep 0.1 # don't race against the log
diff --git a/modules/landing_zone_vsi/configuration_steps/management_values.tpl b/modules/landing_zone_vsi/configuration_steps/management_values.tpl
deleted file mode 100644
index ad3963e1..00000000
--- a/modules/landing_zone_vsi/configuration_steps/management_values.tpl
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/bash
-
-###################################################
-# Copyright (C) IBM Corp. 2023 All Rights Reserved.
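# management_values.tpl, deleted in this change, simply exported the deployment inputs as
# shell variables for the configuration scripts. A minimal, illustrative guard of the kind
# such a values file can pair with -- the variable list is a sample taken from this template
# and the check itself is not part of the original file:
for required_var in cluster_name dns_domain mount_path; do
  if [ -z "${!required_var}" ]; then
    echo "ERROR: required deployment value '$required_var' is empty" >&2
    exit 1
  fi
done
# (In a real Terraform .tpl, literal interpolation-like text would need to be written as
# $${...} so that templatefile() does not try to substitute it.)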
-# Licensed under the Apache License v2.0 -################################################### - -### EXPORT_USER_DATA ### - -#input parameters -VPC_APIKEY_VALUE="${vpc_apikey_value}" -RESOURCE_RECORDS_APIKEY_VALUE="${vpc_apikey_value}" -management_node_count="${management_node_count}" -api_endpoint_eu_de="${api_endpoint_eu_de}" -api_endpoint_us_east="${api_endpoint_us_east}" -api_endpoint_us_south="${api_endpoint_us_south}" -imageID="${image_id}" -subnetID="${subnet_id}" -subnetId="${subnet_id}" -vpcID="${vpc_id}" -securityGroupID="${security_group_id}" -sshkey_ID="${sshkey_id}" -regionName="${region_name}" -zoneName="${zone_name}" -zone="${zone_name}" -# the CIDR block for dyanmic hosts -rc_cidr_block="${rc_cidr_block}" -# the maximum allowed dynamic hosts created by RC -rc_max_num=${rc_max_num} -rc_rg=${rc_rg} -cluster_name="${cluster_name}" -ce_project_guid="${ce_project_guid}" -cluster_prefix="${cluster_prefix}" -cluster_private_key_content="${cluster_private_key_content}" -cluster_public_key_content="${cluster_public_key_content}" -bastion_public_key_content="${bastion_public_key_content}" -hyperthreading="${hyperthreading}" -network_interface=${network_interface} -dns_domain="${dns_domain}" -mount_path="${mount_path}" -custom_file_shares="${custom_file_shares}" -custom_mount_paths="${custom_mount_paths}" -contract_id="${contract_id}" -app_center_gui_pwd="${app_center_gui_pwd}" -enable_app_center="${enable_app_center}" -management_ip=${management_ip} -management_hostname=${management_hostname} -management_cand_ips=${management_cand_ips} -management_cand_hostnames=${management_cand_hostnames} -login_ip=${login_ip} -login_hostname=${login_hostname} -# PAC High Availability -app_center_high_availability="${app_center_high_availability}" -db_adminuser="${db_adminuser}" -db_adminpassword="${db_adminpassword}" -db_hostname="${db_hostname}" -db_port="${db_port}" -db_name="${db_name}" -db_user="${db_user}" -db_password="${db_password}" -db_certificate="${db_certificate}" -# LDAP Server -enable_ldap="${enable_ldap}" -ldap_server_ip="${ldap_server_ip}" -ldap_server_cert="${ldap_server_cert}" -ldap_server_hostname="${ldap_server_hostname}" -ldap_basedns="${ldap_basedns}" -bootdrive_crn="${bootdrive_crn}" -# Observability -observability_monitoring_enable="${observability_monitoring_enable}" -observability_monitoring_on_compute_nodes_enable="${observability_monitoring_on_compute_nodes_enable}" -cloud_monitoring_access_key="${cloud_monitoring_access_key}" -cloud_monitoring_ingestion_url="${cloud_monitoring_ingestion_url}" -cloud_monitoring_prws_key="${cloud_monitoring_prws_key}" -cloud_monitoring_prws_url="${cloud_monitoring_prws_url}" -cloud_logs_ingress_private_endpoint="${cloud_logs_ingress_private_endpoint}" -observability_logs_enable_for_management="${observability_logs_enable_for_management}" -observability_logs_enable_for_compute="${observability_logs_enable_for_compute}" -solution="${solution}" -rc_ncores="${rc_ncores}" -rc_ncpus="${rc_ncpus}" -rc_memInMB="${rc_mem_in_mb }" -rc_profile="${rc_profile}" diff --git a/modules/landing_zone_vsi/datasource.tf b/modules/landing_zone_vsi/datasource.tf index 28a006a9..55b62f4b 100644 --- a/modules/landing_zone_vsi/datasource.tf +++ b/modules/landing_zone_vsi/datasource.tf @@ -1,31 +1,103 @@ -data "ibm_is_image" "management" { - name = var.management_image_name - count = local.image_mapping_entry_found ? 
0 : 1 +# data "ibm_resource_group" "existing_resource_group" { +# name = var.existing_resource_group +# } + +data "ibm_is_image" "management_stock_image" { + count = local.image_mapping_entry_found ? 0 : length(var.management_instances) + name = var.management_instances[count.index]["image"] +} + +# data "ibm_is_image" "management" { +# name = var.management_instances[0]["image"] +# count = local.image_mapping_entry_found ? 0 : 1 +# } + +# data "ibm_is_image" "compute" { +# name = var.static_compute_instances[0]["image"] +# count = local.compute_image_found_in_map ? 1 : 0 +# } + +# TODO: Verify distinct profiles +/* +data "ibm_is_instance_profile" "management" { + name = var.management_profile +} + +data "ibm_is_instance_profile" "compute" { + name = var.compute_profile } -data "ibm_is_image" "compute" { - name = var.compute_image_name - count = local.compute_image_from_data ? 1 : 0 +data "ibm_is_instance_profile" "protocol" { + name = var.protocol_profile } +*/ -data "ibm_is_image" "login" { - name = var.login_image_name - count = local.login_image_mapping_entry_found ? 0 : 1 +data "ibm_is_image" "client" { + count = length(var.client_instances) + name = var.client_instances[count.index]["image"] } -data "ibm_is_region" "region" { - name = local.region +data "ibm_is_image" "compute_stock_image" { + count = local.compute_image_found_in_map ? 0 : length(var.static_compute_instances) + name = var.static_compute_instances[count.index]["image"] +} + +data "ibm_is_image" "storage" { + count = length(var.storage_instances) + name = var.storage_instances[count.index]["image"] +} + +# data "ibm_is_image" "protocol" { +# count = length(var.protocol_instances) +# name = var.protocol_instances[count.index]["image"] +# } + + +data "ibm_is_ssh_key" "ssh_keys" { + for_each = toset(var.ssh_keys) + name = each.key } -data "ibm_is_instance_profile" "management_node" { - name = var.management_node_instance_type +data "ibm_is_instance_profile" "storage" { + count = length(var.storage_instances) + name = var.storage_instances[count.index]["profile"] } -data "ibm_is_instance_profile" "worker_node" { - name = var.worker_node_instance_type[0].instance_type +data "ibm_is_instance_profile" "storage_tie_instance" { + count = length(var.storage_instances) + name = var.storage_instances[count.index]["profile"] +} + +data "ibm_is_ssh_key" "gklm" { + for_each = toset(var.gklm_instance_key_pair) + name = each.key +} + +data "ibm_is_ssh_key" "ldap" { + for_each = toset(var.ldap_instance_key_pair) + name = each.key } data "ibm_is_image" "ldap_vsi_image" { - name = var.ldap_vsi_osimage_name - count = var.ldap_basedns != null && var.ldap_server == "null" ? 1 : 0 + count = var.enable_ldap != null && var.ldap_server == "null" ? 1 : 0 + name = var.ldap_instances[count.index]["image"] +} + +data "ibm_is_image" "afm" { + count = length(var.afm_instances) + name = var.afm_instances[count.index]["image"] +} + +data "ibm_is_image" "gklm" { + count = length(var.gklm_instances) + name = var.gklm_instances[count.index]["image"] +} + +data "ibm_is_image" "login_vsi_image" { + count = local.login_image_found_in_map ? 0 : 1 + name = var.login_instance[count.index]["image"] +} + +data "ibm_is_dedicated_host_profiles" "profiles" { + count = var.enable_dedicated_host ? 
1 : 0 } diff --git a/modules/landing_zone_vsi/image_map.tf b/modules/landing_zone_vsi/image_map.tf index 9c1277e8..f58ee9d7 100644 --- a/modules/landing_zone_vsi/image_map.tf +++ b/modules/landing_zone_vsi/image_map.tf @@ -1,37 +1,52 @@ locals { image_region_map = { - "hpcaas-lsf10-rhel810-compute-v8" = { - "eu-gb" = "r018-fd4a0927-72df-440c-93f9-f6a325ec90b6" - "eu-de" = "r010-3b541f40-64ab-41f2-ba96-720fd3862a85" - "us-east" = "r014-188b366f-25bb-4545-9bf9-11004bb4a016" - "us-south" = "r006-a99df2a9-5a28-4ba2-b964-0f7e5fd40ac1" - "jp-tok" = "r022-7d1e34af-b876-458a-b4b6-f7b5744ca8db" - "jp-osa" = "r034-a085a1b5-7f70-40a1-9d84-172d844dfbbc" - "au-syd" = "r026-5b600da8-6c93-42e8-9015-48d220180f3b" - "br-sao" = "r042-e8ed8280-b1c1-45ba-9fe2-aa5ece321799" - "ca-tor" = "r038-bbb8e69c-ddd0-42ab-bd74-b39904c4adfe" + "hpc-lsf-fp15-rhel810-v1" = { + "eu-es" = "r050-deeeb734-2523-4aff-96e3-2be8d2b0d634" + "eu-gb" = "r018-8edcd9a1-dbca-462f-bf74-017c15ca4b71" + "eu-de" = "r010-394c5295-1704-4066-b57e-ae9bca1968de" + "us-east" = "r014-1777cdcb-8a68-4ef0-becf-84ec0d2e9a26" + "us-south" = "r006-40caf671-28a8-42c5-b83e-b2ba3ceb86af" + "jp-tok" = "r022-01531301-d100-44ba-b1a3-12e7c8d65469" + "jp-osa" = "r034-ac455775-c667-4d3e-b281-9ef845080599" + "au-syd" = "r026-eff4d59c-5006-46cc-8b03-60514f763a87" + "br-sao" = "r042-1e1bbeeb-3ef7-4f7a-a44c-9f50609bb538" + "ca-tor" = "r038-bb9fcdb7-d200-4cdd-af04-6848007c9cb2" }, - "hpcaas-lsf10-ubuntu2204-compute-v8" = { - "us-east" = "r014-b8deeb5c-90d7-4c07-80a6-d9b130510661" - "eu-de" = "r010-1b56109c-b22c-4fca-91a9-e39e98c8d928" - "us-south" = "r006-eb1e8993-5455-4b98-8a9d-d6e1fe364c08" + "hpc-lsf-fp15-compute-rhel810-v1" = { + "eu-es" = "r050-f0608e39-9dcf-4aca-9e92-7719474b3e86" + "eu-gb" = "r018-db8b97a8-6f87-4cf7-a044-847da6ab5c59" + "eu-de" = "r010-957efd6b-e7b3-4249-8644-6184f1531915" + "us-east" = "r014-5fdd6a25-5943-4084-9c57-b900a80579a3" + "us-south" = "r006-5c0e462a-679c-4a18-81a5-0fe036f483a3" + "jp-tok" = "r022-8087a984-8912-42ff-9576-c5cab8edda3a" + "jp-osa" = "r034-728d1f12-7842-412c-97a0-9deb66c23962" + "au-syd" = "r026-f957ed22-9565-441c-bce6-f716360e02ea" + "br-sao" = "r042-7bf7d508-a7b1-4434-ae6a-6986f7042d4e" + "ca-tor" = "r038-a658da44-f1b4-4e02-826a-38b16e6ae98a" }, - "hpcaas-lsf10-rhel810-v12" = { - "us-east" = "r014-5ae97886-6bcb-4fde-9da3-740a513261a8" - "eu-de" = "r010-1c8df3b1-8def-45eb-82ac-ab2db1612bd9" - "us-south" = "r006-045e03ee-4cfa-4415-a4ec-d8bceadc1bdb" + "hpc-lsf-fp14-rhel810-v1" = { + "eu-es" = "r050-12a3533c-5fa1-4bcc-8765-7150a06e122e" + "eu-gb" = "r018-3ef87e4e-0f46-424a-b623-fa25215094c0" + "eu-de" = "r010-48e5560b-4d34-43ca-b824-2d85513f3188" + "us-east" = "r014-3719a4e2-6746-4eaf-844a-c3721b7c6d32" + "us-south" = "r006-e720ec63-5e8c-46ce-b7a2-51c454e64099" + "jp-tok" = "r022-917ce78b-dacf-4008-b6c0-4058bf59a5b4" + "jp-osa" = "r034-507fb655-4164-45b8-b1d7-f6cb2fbeafc9" + "au-syd" = "r026-01900450-7314-42ea-aee3-acf5179300c0" + "br-sao" = "r042-bb407137-93cf-4ec7-aa77-4702896fff97" + "ca-tor" = "r038-6683403d-1cf5-4f39-a96f-c8cbb2314ad5" }, - "hpc-lsf10-rhel810-v2" = { - "eu-es" = "r050-86c03f46-e10a-4edf-8fcf-103845362db9" - "eu-gb" = "r018-90675b8a-db1b-4a41-b5a0-f21c04cb7d57" - "eu-de" = "r010-dd925c68-d186-406b-a8f7-8d965c60512b" - "us-east" = "r014-4bc87a52-d377-43da-a042-aa1fa1629d28" - "us-south" = "r006-6540f00a-525d-4f62-8a35-f218520b37d2" - "jp-tok" = "r022-02a31841-c5ca-4527-a660-d8e5b1cfb29e" - "jp-osa" = "r034-c7e76920-e735-4702-b04c-1f2cffe170cb" - "au-syd" = "r026-ad5cdb8f-1c44-4267-8969-fe62ac0e93a4" - 
"br-sao" = "r042-b89b9b8c-a934-4f9d-88bc-b9a15866f223" - "ca-tor" = "r038-d5992a56-ddd1-4156-a98c-54ecef51ae3d" + "hpc-lsf-fp14-compute-rhel810-v1" = { + "eu-es" = "r050-d2ad9625-1668-4b2c-a8bb-6ef14678d3ed" + "eu-gb" = "r018-f1059503-27ec-44d4-a981-21be6225520a" + "eu-de" = "r010-8115b1f6-912e-4b55-89f1-e448c397115e" + "us-east" = "r014-5108884c-011b-4473-b585-0d43309c37e3" + "us-south" = "r006-68c6af72-1abf-4d13-bca1-4f42be5d2c70" + "jp-tok" = "r022-1932c5ec-b5a6-4262-aa56-6c6257c8297f" + "jp-osa" = "r034-50be9bd9-9623-4ffc-8ce7-aab66f674137" + "au-syd" = "r026-11aee148-c938-4524-91e6-8e6da5933a42" + "br-sao" = "r042-5cb62448-e771-4caf-a556-28fdf88acab9" + "ca-tor" = "r038-fa815ec1-d52e-42b2-8221-5b8c2145a248" } } } diff --git a/modules/landing_zone_vsi/locals.tf b/modules/landing_zone_vsi/locals.tf index 70c5e79c..99f1755d 100644 --- a/modules/landing_zone_vsi/locals.tf +++ b/modules/landing_zone_vsi/locals.tf @@ -1,212 +1,399 @@ # define variables locals { + # Future use + # products = "scale" name = "lsf" prefix = var.prefix tags = [local.prefix, local.name] vsi_interfaces = ["eth0", "eth1"] + bms_interfaces = ["ens1", "ens2"] # TODO: explore (DA always keep it true) skip_iam_authorization_policy = true - enable_compute = true - enable_management = true - ldap_node_name = format("%s-%s", local.prefix, "ldap") - login_node_name = format("%s-%s", local.prefix, "login") - management_node_name = format("%s-%s", local.prefix, "mgmt") - worker_node_name = format("%s-%s", local.prefix, "worker") - ldap_enable = var.enable_ldap == true && var.ldap_server == "null" ? 1 : 0 - # enable_worker_vsi = var.solution == "lsf" && var.worker_node_min_count >= 0 ? var.worker_node_min_count : 0 - # products = var.solution == "lsf" && var.enable_app_center ? "lsf,lsf-app-center" : "lsf" - # Region and Zone calculations region = join("-", slice(split("-", var.zones[0]), 0, 2)) - # # TODO: Compute & storage can't be added due to SG rule limitation - /* [ERROR] Error while creating Security Group Rule Exceeded limit of remote rules per security group - (the limit is 5 remote rules per security group)*/ - compute_security_group_rules = [ - { - name = "allow-all-bastion-inbound" - direction = "inbound" - remote = var.bastion_security_group_id - }, - { - name = "allow-port-22-inbound" - direction = "inbound" - remote = var.bastion_security_group_id - tcp = { - port_min = 22 - port_max = 22 - } - }, - { - name = "allow-all-compute-inbound" - direction = "inbound" - remote = module.compute_sg[0].security_group_id_for_ref - }, - { - name = "allow-all-compute-0-inbound" - direction = "inbound" - remote = local.compute_subnets[0].cidr - tcp = { - port_min = 2049 - port_max = 2049 - } - }, - { - name = "allow-all-storage-inbound" - direction = "inbound" - remote = var.storage_security_group_id != null ? 
var.storage_security_group_id : module.compute_sg[0].security_group_id_for_ref - }, - { - name = "allow-all-bastion-outbound" - direction = "outbound" - remote = var.bastion_security_group_id - }, - { - name = "allow-all-compute-0-outbound" - direction = "outbound" - remote = local.compute_subnets[0].cidr - tcp = { - port_min = 2049 - port_max = 2049 - } - }, - { - name = "allow-all-outbound-outbound" - direction = "outbound" - remote = "0.0.0.0/0" - }, - ] + # management_image_id = data.ibm_is_image.management_stock_image[*].id + # Check whether an entry is found in the mapping file for the given management node image + image_mapping_entry_found = contains(keys(local.image_region_map), var.management_instances[0]["image"]) + new_image_id = local.image_mapping_entry_found ? local.image_region_map[var.management_instances[0]["image"]][local.region] : "Image not found with the given name" - storage_nfs_security_group_rules = [ - { - name = "allow-all-hpcaas-compute-sg" - direction = "inbound" - remote = module.compute_sg[0].security_group_id - } - ] + # compute_image_id = data.ibm_is_image.compute_stock_image[*].id + # Check whether an entry is found in the mapping file for the given compute node image + compute_image_found_in_map = contains(keys(local.image_region_map), var.static_compute_instances[0]["image"]) + # If not found, assume the name is the id already (customer provided image) + new_compute_image_id = local.compute_image_found_in_map ? local.image_region_map[var.static_compute_instances[0]["image"]][local.region] : "Image not found with the given name" - # LDAP security group rule for Cluster - ldap_security_group_rule_for_cluster = [ - { - name = "inbound-rule-for-ldap-node-connection" - direction = "inbound" - remote = var.ldap_server - tcp = { - port_min = 389 - port_max = 389 - } - } - ] + # login_image_id = data.ibm_is_image.login_vsi_image[*].id + # Check whether an entry is found in the mapping file for the given login node image + login_image_found_in_map = contains(keys(local.image_region_map), var.login_instance[0]["image"]) + # If not found, assume the name is the id already (customer provided image) + new_login_image_id = local.login_image_found_in_map ? local.image_region_map[var.login_instance[0]["image"]][local.region] : "Image not found with the given name" - # SSH connection to the Login node via Cluster nodes. - ssh_connection_to_login_node_via_cluster_nodes = [ - { - name = "inbound-rule-for-login-node-ssh-connection" - direction = "inbound" - remote = module.compute_sg[0].security_group_id - tcp = { - port_min = 22 - port_max = 22 - } - } - ] + products = var.scheduler == "Scale" ? 
"scale" : "lsf" + block_storage_volumes = [for volume in coalesce(var.nsd_details, []) : { + name = format("nsd-%s", index(var.nsd_details, volume) + 1) + profile = volume["profile"] + capacity = volume["capacity"] + iops = volume["iops"] + resource_group = var.resource_group + # TODO: Encryption + # encryption_key = + }] + # TODO: Update the LB configurable + # Bug: 5847 - LB profile & subnets are not configurable + /* + load_balancers = [{ + name = "hpc" + type = "private" + listener_port = 80 + listener_protocol = "http" + connection_limit = 10 + algorithm = "round_robin" + protocol = "http" + health_delay = 60 + health_retries = 5 + health_timeout = 30 + health_type = "http" + pool_member_port = 80 + }] + */ + + client_instance_count = sum(var.client_instances[*]["count"]) + management_instance_count = sum(var.management_instances[*]["count"]) + storage_instance_count = var.storage_type == "persistent" ? sum(var.storage_servers[*]["count"]) : sum(var.storage_instances[*]["count"]) + protocol_instance_count = sum(var.protocol_instances[*]["count"]) + static_compute_instance_count = sum(var.static_compute_instances[*]["count"]) + + enable_client = local.client_instance_count > 0 + enable_management = local.management_instance_count > 0 + enable_compute = local.management_instance_count > 0 || local.static_compute_instance_count > 0 + enable_storage = local.storage_instance_count > 0 + enable_protocol = local.storage_instance_count > 0 && local.protocol_instance_count > 0 + # TODO: Fix the logic + enable_block_storage = var.storage_type == "scratch" ? true : false + + # Future use + # TODO: Fix the logic + # enable_load_balancer = false + + client_node_name = format("%s-%s", local.prefix, "client") + management_node_name = format("%s-%s", local.prefix, "mgmt") + compute_node_name = format("%s-%s", local.prefix, "comp") + storage_node_name = format("%s-%s", local.prefix, "strg") + protocol_node_name = format("%s-%s", local.prefix, "proto") + storage_management_node_name = format("%s-%s", local.prefix, "strg-mgmt") + ldap_node_name = format("%s-%s", local.prefix, "ldap") + afm_node_name = format("%s-%s", local.prefix, "afm") + gklm_node_name = format("%s-%s", local.prefix, "gklm") + cpmoute_management_node_name = format("%s-%s", local.prefix, "comp-mgmt") + login_node_name = format("%s-%s", local.prefix, "login") + + # Future use + /* + management_instance_count = sum(var.management_instances[*]["count"]) + management_instance_profile = flatten([for item in var.management_instances: [ + for count in range(item["count"]) : var.management_instances[index(var.management_instances, item)]["profile"] + ]]) + static_compute_instance_count = sum(var.static_compute_instances[*]["count"]) + storage_instance_count = sum(var.storage_instances[*]["count"]) + protocol_instance_count = sum(var.protocol_instances[*]["count"]) + */ + + # Future use + /* + client_image_name = var.client_image_name + management_image_name = var.management_image_name + compute_image_name = var.compute_image_name + storage_image_name = var.storage_image_name + protocol_image_name = var.storage_image_name + */ + + client_image_id = data.ibm_is_image.client[*].id + storage_image_id = data.ibm_is_image.storage[*].id + protocol_image_id = data.ibm_is_image.storage[*].id + ldap_image_id = data.ibm_is_image.ldap_vsi_image[*].id + afm_image_id = data.ibm_is_image.afm[*].id + gklm_image_id = data.ibm_is_image.gklm[*].id + + ssh_keys = [for name in var.ssh_keys : data.ibm_is_ssh_key.ssh_keys[name].id] + ldap_ssh_keys = [for name in 
var.ldap_instance_key_pair : data.ibm_is_ssh_key.ldap[name].id] + gklm_ssh_keys = [for name in var.gklm_instance_key_pair : data.ibm_is_ssh_key.gklm[name].id] + + # Future use + /* + # Scale static configs + scale_cloud_deployer_path = "/opt/IBM/ibm-spectrumscale-cloud-deploy" + scale_cloud_install_repo_url = "https://github.com/IBM/ibm-spectrum-scale-cloud-install" + scale_cloud_install_repo_name = "ibm-spectrum-scale-cloud-install" + scale_cloud_install_branch = "5.1.8.1" + scale_cloud_infra_repo_url = "https://github.com/IBM/ibm-spectrum-scale-install-infra" + scale_cloud_infra_repo_name = "ibm-spectrum-scale-install-infra" + scale_cloud_infra_repo_tag = "v2.7.0" + */ + + # Region and Zone calculations + # region = join("-", slice(split("-", var.zones[0]), 0, 2)) + + # TODO: DNS configs + # Security group rules + # client_security_group = local.enable_client ? module.client_sg[0].security_group_id : null + # compute_security_group = local.enable_compute ? module.compute_sg[0].security_group_id : null + # storage_security_group = local.enable_storage ? module.storage_sg[0].security_group_id : null + + # client_security_group_remote = compact([var.bastion_security_group_id]) + # compute_security_group_remote = compact([var.bastion_security_group_id]) + # storage_security_group_remote = compact([var.bastion_security_group_id]) + + # Derived configs + # VPC + # resource_group_id = data.ibm_resource_group.existing_resource_group.id # Subnets # TODO: Multi-zone multi-vNIC VSIs deployment support (bug #https://github.ibm.com/GoldenEye/issues/issues/5830) # Findings: Singe zone multi-vNICs VSIs deployment & multi-zone single vNIC VSIs deployment are supported. - compute_subnets = var.compute_subnets + client_subnets = var.client_subnets + cluster_subnet_id = var.cluster_subnet_id + storage_subnets = var.storage_subnets + protocol_subnets = var.protocol_subnets - # Check whether an entry is found in the mapping file for the given management node image - image_mapping_entry_found = contains(keys(local.image_region_map), var.management_image_name) - new_image_id = local.image_mapping_entry_found ? local.image_region_map[var.management_image_name][local.region] : "Image not found with the given name" + compute_public_key_content = one(module.compute_key[*].public_key_content) + compute_private_key_content = one(module.compute_key[*].private_key_content) - # Check whether an entry is found in the mapping file for the given compute node image - compute_image_found_in_map = contains(keys(local.image_region_map), var.compute_image_name) - # If not found, assume the name is the id already (customer provided image) - new_compute_image_id = local.compute_image_found_in_map ? local.image_region_map[var.compute_image_name][local.region] : var.compute_image_name - compute_image_from_data = !local.compute_image_found_in_map && !startswith(local.new_compute_image_id, "crn:") - - # Check whether an entry is found in the mapping file for the given login node image - login_image_mapping_entry_found = contains(keys(local.image_region_map), var.login_image_name) - new_login_image_id = local.login_image_mapping_entry_found ? local.image_region_map[var.login_image_name][local.region] : "Image not found with the given name" - - compute_node_max_count = 500 - rc_max_num = var.solution == "hpc" ? local.compute_node_max_count : var.worker_node_max_count - vcpus = tonumber(data.ibm_is_instance_profile.worker_node.vcpu_count[0].value) - ncores = local.vcpus / 2 - ncpus = var.hyperthreading_enabled ? 
local.vcpus : local.ncores - mem_in_mb = tonumber(data.ibm_is_instance_profile.worker_node.memory[0].value) * 1024 - rc_profile = data.ibm_is_instance_profile.worker_node.name - - bastion_subnets = var.bastion_subnets - ldap_server = var.enable_ldap == true && var.ldap_server == "null" ? length(module.ldap_vsi) > 0 ? var.ldap_primary_ip[0] : null : var.ldap_server - ldap_server_cert = var.enable_ldap == true && var.ldap_server_cert != "null" ? var.ldap_server_cert : "null" - ldap_instance_image_id = var.enable_ldap == true && var.ldap_server == "null" ? data.ibm_is_image.ldap_vsi_image[0].id : "null" - - # The below logic is needed to point the API endpoints for the dynanic host creation - us_east = "https://api.us-east.codeengine.cloud.ibm.com/v2beta" - eu_de = "https://api.eu-de.codeengine.cloud.ibm.com/v2beta" - us_south = "https://api.us-south.codeengine.cloud.ibm.com/v2beta" - - # ip/names of vsis - management_vsi_data = flatten(module.management_vsi[*]["list"]) - management_private_ip = local.management_vsi_data[0]["ipv4_address"] - management_hostname = local.management_vsi_data[0]["name"] - - management_candidate_vsi_data = flatten(module.management_candidate_vsi[*]["list"]) - management_candidate_private_ips = local.management_candidate_vsi_data[*]["ipv4_address"] - management_candidate_hostnames = local.management_candidate_vsi_data[*]["name"] - - worker_vsi_data = flatten(module.worker_vsi[*]["list"]) - worker_private_ip = local.worker_vsi_data[*]["ipv4_address"] - - login_vsi_data = flatten(module.login_vsi[*]["list"]) - login_private_ips = local.login_vsi_data[*]["ipv4_address"] - login_hostnames = local.login_vsi_data[*]["name"] - - ldap_vsi_data = flatten(module.ldap_vsi[*]["list"]) - #ldap_private_ips = local.ldap_vsi_data[*]["ipv4_address"] - ldap_hostnames = local.ldap_vsi_data[*]["name"] + # Security Groups + protocol_secondary_security_group = flatten([ + for subnet_index, subnet in local.protocol_subnets : [ + for i in range(var.protocol_instances[subnet_index]["count"]) : { + security_group_id = one(module.storage_sg[*].security_group_id) + interface_name = "${subnet["name"]}-${i}" + } + ] + ]) + # ldap_instance_image_id = var.enable_ldap == true && var.ldap_server == "null" ? data.ibm_is_image.ldap_vsi_image[0].id : "null" } -########################################################################### -# IBM Cloud Dababase for MySQL database local variables -########################################################################### locals { - db_name = "pac" - db_user = "pacuser" -} -## Differentiating VPC File Share and NFS share -locals { - nfs_file_share = [ - for share in var.mount_path : - { - mount_path = share.mount_path - nfs_share = share.nfs_share + # Getting current/available dedicated host profiles + current_dh_profiles = var.enable_dedicated_host ? 
[for p in data.ibm_is_dedicated_host_profiles.profiles[0].profiles : p if p.status == "current"] : [] + + # Get valid instance profiles from available dedicated hosts + valid_instance_profiles = toset(distinct(flatten([ + for p in local.current_dh_profiles : p.supported_instance_profiles[*].name + ]))) + + # Extract profile family prefix (e.g., "bx2" from "bx2-16x64") + instance_profile_prefixes = distinct([ + for inst in var.static_compute_instances : + regex("^([a-z]+[0-9]+)", inst.profile)[0] + ]) + + # Map instance profile prefixes to available dedicated host profiles + profile_mappings = { + for prefix in local.instance_profile_prefixes : + prefix => { + dh_profiles = [ + for p in local.current_dh_profiles : + p if startswith(p.name, "${prefix}-host") + ] } - if share.mount_path != "/mnt/lsf" && share.nfs_share != null && share.nfs_share != "" + } + + # Validate each instance configuration + validation_results = [ + for inst in var.static_compute_instances : { + profile = inst.profile + profile_prefix = regex("^([a-z]+[0-9]+)", inst.profile)[0] + count = inst.count + instance_valid = contains(local.valid_instance_profiles, inst.profile) + dh_profile_available = length(local.profile_mappings[regex("^([a-z]+[0-9]+)", inst.profile)[0]].dh_profiles) > 0 + } if inst.count > 0 ] - vpc_file_share = [ - for share in var.mount_path : - { - mount_path = share.mount_path - size = share.size - iops = share.iops + # Error messages for invalid configurations + errors = concat( + [ + for vr in local.validation_results : + "ERROR: Dedicated Host for the instance profile '${vr.profile}' is not available in this region" + if !vr.instance_valid + ], + [ + for vr in local.validation_results : + "ERROR: No CURRENT dedicated host profile available for '${vr.profile_prefix}-host-*' (required for '${vr.profile}')" + if vr.instance_valid && !vr.dh_profile_available + ] + ) + + # Create one dedicated host config per instance profile (not per count) + dedicated_host_config = { + for vr in local.validation_results : + vr.profile => { + class = vr.profile_prefix + profile = local.profile_mappings[vr.profile_prefix].dh_profiles[0].name + family = local.profile_mappings[vr.profile_prefix].dh_profiles[0].family + count = vr.count + } + if vr.instance_valid && vr.dh_profile_available + } + + dedicated_host_ids = [ + for instance in var.static_compute_instances : { + profile = instance.profile + id = try(one(module.dedicated_host[instance.profile].dedicated_host_id), "") } - if share.mount_path != "/mnt/lsf" && share.size != null && share.iops != null ] + + dedicated_host_map = { for instance in local.dedicated_host_ids : instance.profile => instance.id } + } +# Validating profile configurations locals { - flattened_worker_nodes = flatten([ - for key, value in var.worker_node_instance_type : [ - for idx in range(value.count) : { - instance_type = value.instance_type - prefix = format("%s-%s-%d", local.worker_node_name, key, idx + 1) - } + should_validate_profile = var.enable_dedicated_host && length(local.errors) > 0 +} + +check "profile_validation" { + assert { + condition = !local.should_validate_profile + error_message = join("\n", concat( + ["Deployment configuration invalid:"], + local.errors, + ["", "Available CURRENT dedicated host profiles in this region:"], + [for p in local.current_dh_profiles : " - ${p.name} (${p.family})"] + )) + } +} + +locals { + + bastion_security_group = var.bastion_security_group_id + # Security group id + client_security_group = local.enable_client ? 
module.client_sg[0].security_group_id_for_ref : null + compute_security_group = local.enable_compute ? module.compute_sg[0].security_group_id_for_ref : null + storage_security_group = local.enable_storage ? module.storage_sg[0].security_group_id_for_ref : null + + client_security_group_rules = local.enable_client ? (local.enable_compute ? + [ + { name = "client-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "client-allow-clientsg-inbound", direction = "inbound", remote = local.client_security_group }, + { name = "client-allow-computesg-inbound", direction = "inbound", remote = local.compute_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr } + ] : + [ + { name = "client-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "client-allow-clientsg-inbound", direction = "inbound", remote = local.client_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr } ] - ]) + ) : (local.enable_compute ? + [ + { name = "client-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "client-allow-computesg-inbound", direction = "inbound", remote = local.compute_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr } + ] + : + [ + { name = "client-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr } + ] + ) + + compute_security_group_rules = local.enable_client ? (local.enable_compute ? (local.enable_storage ? + [ + { name = "compute-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "compute-allow-clientsg-inbound", direction = "inbound", remote = local.client_security_group }, + { name = "compute-allow-computesg-inbound", direction = "inbound", remote = local.compute_security_group }, + { name = "compute-allow-storagesg-inbound", direction = "inbound", remote = local.storage_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr }, + { name = "compute-allow-all-outbound", direction = "outbound", remote = "0.0.0.0/0" } + ] : + [ + { name = "compute-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "compute-allow-clientsg-inbound", direction = "inbound", remote = local.client_security_group }, + { name = "compute-allow-computesg-inbound", direction = "inbound", remote = local.compute_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr }, + { name = "compute-allow-all-outbound", direction = "outbound", remote = "0.0.0.0/0" } + ] + ) : (local.enable_storage ? 
+ [ + { name = "compute-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "compute-allow-clientsg-inbound", direction = "inbound", remote = local.client_security_group }, + { name = "compute-allow-storagesg-inbound", direction = "inbound", remote = local.storage_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr }, + { name = "compute-allow-all-outbound", direction = "outbound", remote = "0.0.0.0/0" } + ] : + [ + { name = "compute-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "compute-allow-clientsg-inbound", direction = "inbound", remote = local.client_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr }, + { name = "compute-allow-all-outbound", direction = "outbound", remote = "0.0.0.0/0" } + ] + ) + ) : (local.enable_compute ? (local.enable_storage ? + [ + { name = "compute-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "compute-allow-computesg-inbound", direction = "inbound", remote = local.compute_security_group }, + { name = "compute-allow-storagesg-inbound", direction = "inbound", remote = local.storage_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr }, + { name = "compute-allow-all-outbound", direction = "outbound", remote = "0.0.0.0/0" } + ] : + [ + { name = "compute-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "compute-allow-computesg-inbound", direction = "inbound", remote = local.compute_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr }, + { name = "compute-allow-all-outbound", direction = "outbound", remote = "0.0.0.0/0" } + ] + ) : (local.enable_storage ? + [ + { name = "compute-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "compute-allow-storagesg-inbound", direction = "inbound", remote = local.storage_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr }, + { name = "compute-allow-all-outbound", direction = "outbound", remote = "0.0.0.0/0" } + ] : + [ + { name = "compute-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr }, + { name = "compute-allow-all-outbound", direction = "outbound", remote = "0.0.0.0/0" } + ] + ) + ) + + storage_security_group_rules = local.enable_compute ? (local.enable_storage ? + [ + { name = "storage-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "storage-allow-computesg-inbound", direction = "inbound", remote = local.compute_security_group }, + { name = "storage-allow-storagesg-inbound", direction = "inbound", remote = local.storage_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr } + ] : + [ + { name = "storage-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "storage-allow-computesg-inbound", direction = "inbound", remote = local.compute_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr } + ] + ) : (local.enable_storage ? 
+ [ + { name = "storage-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "storage-allow-storagesg-inbound", direction = "inbound", remote = local.storage_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr } + ] : + [ + { name = "storage-allow-bastionsg-inbound", direction = "inbound", remote = local.bastion_security_group }, + { name = "client-allow-network-inbound", direction = "inbound", remote = var.cluster_cidr } + ] + ) + + storage_nfs_security_group_rules = [ + { + name = "allow-all-compute-sg" + direction = "inbound" + remote = local.compute_security_group + } + ] + + bastion_security_group_update_rule = local.enable_compute ? [ + { name = "bastion-allow-compute-sg", direction = "inbound", remote = local.compute_security_group } + ] : (local.enable_storage ? [ + { name = "bastion-allow-storage-sg", direction = "inbound", remote = local.storage_security_group } + ] : (local.enable_client ? [ + { name = "bastion-allow-client-sg", direction = "inbound", remote = local.client_security_group }] : [] + )) } diff --git a/modules/landing_zone_vsi/main.tf b/modules/landing_zone_vsi/main.tf index 070410ee..e0b3930f 100644 --- a/modules/landing_zone_vsi/main.tf +++ b/modules/landing_zone_vsi/main.tf @@ -1,7 +1,84 @@ module "compute_key" { count = local.enable_compute ? 1 : 0 source = "./../key" - # private_key_path = "compute_id_rsa" #checkov:skip=CKV_SECRET_6 + # private_key_path = "./../../modules/ansible-roles/compute_id_rsa" #checkov:skip=CKV_SECRET_6 +} + +resource "null_resource" "entitlement_check" { + count = var.scheduler == "Scale" && var.storage_type != "evaluation" ? 1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo python3 /opt/IBM/cloud_entitlement/entitlement_check.py --products ${local.products} --icns ${var.ibm_customer_number}" + } + triggers = { + build = timestamp() + } +} + +#Checks the Dedicated host profile and stops the build +resource "null_resource" "dedicated_host_validation" { + count = var.enable_dedicated_host && length(var.static_compute_instances) > 0 && local.should_validate_profile ? 1 : 0 + + provisioner "local-exec" { + command = <> /root/.ssh/config + echo "${local.compute_public_key_content}" >> /root/.ssh/authorized_keys + EOT + } + + triggers = { + build = timestamp() + } +} + +module "storage_key" { + count = local.enable_storage ? 1 : 0 + source = "./../key" + # private_key_path = var.enable_deployer ? "${path.root}/../../modules/ansible-roles/storage_id_rsa" : "${path.root}/modules/ansible-roles/storage_id_rsa" #checkov:skip=CKV_SECRET_6 +} + +module "client_sg" { + count = local.enable_client ? 1 : 0 + source = "terraform-ibm-modules/security-group/ibm" + version = "2.6.2" + add_ibm_cloud_internal_rules = true + resource_group = var.resource_group + security_group_name = format("%s-client-sg", local.prefix) + security_group_rules = local.client_security_group_rules + vpc_id = var.vpc_id } module "compute_sg" { @@ -10,40 +87,24 @@ module "compute_sg" { version = "2.6.2" add_ibm_cloud_internal_rules = true resource_group = var.resource_group - security_group_name = format("%s-cluster-sg", local.prefix) + security_group_name = format("%s-comp-sg", local.prefix) security_group_rules = local.compute_security_group_rules vpc_id = var.vpc_id - tags = local.tags -} - -module "compute_sg_with_ldap_connection" { - count = var.ldap_server == "null" ? 
0 : 1 - source = "terraform-ibm-modules/security-group/ibm" - version = "2.6.2" - resource_group = var.resource_group - add_ibm_cloud_internal_rules = true - use_existing_security_group_id = true - existing_security_group_id = module.compute_sg[0].security_group_id - security_group_rules = local.ldap_security_group_rule_for_cluster - vpc_id = var.vpc_id - depends_on = [module.compute_sg] } -module "ssh_connection_to_login_node_via_cluster_nodes" { - count = var.bastion_instance_name != null ? 1 : 0 +module "bastion_sg_existing" { source = "terraform-ibm-modules/security-group/ibm" version = "2.6.2" resource_group = var.resource_group - add_ibm_cloud_internal_rules = true + add_ibm_cloud_internal_rules = false use_existing_security_group_id = true existing_security_group_id = var.bastion_security_group_id - security_group_rules = local.ssh_connection_to_login_node_via_cluster_nodes + security_group_rules = local.bastion_security_group_update_rule vpc_id = var.vpc_id - depends_on = [module.compute_sg] } module "nfs_storage_sg" { - count = var.storage_security_group_id != null ? 1 : 0 + count = var.storage_security_group_id != "" ? 1 : 0 source = "terraform-ibm-modules/security-group/ibm" version = "2.6.2" resource_group = var.resource_group @@ -54,225 +115,379 @@ module "nfs_storage_sg" { vpc_id = var.vpc_id } -module "management_vsi" { - count = 1 +module "storage_sg" { + count = local.enable_storage ? 1 : 0 + source = "terraform-ibm-modules/security-group/ibm" + version = "2.6.2" + add_ibm_cloud_internal_rules = true + resource_group = var.resource_group + security_group_name = format("%s-strg-sg", local.prefix) + security_group_rules = local.storage_security_group_rules + vpc_id = var.vpc_id +} + +module "login_vsi" { + count = var.scheduler == "LSF" ? 1 : 0 source = "terraform-ibm-modules/landing-zone-vsi/ibm" version = "5.0.0" vsi_per_subnet = 1 create_security_group = false security_group = null - image_id = local.image_mapping_entry_found ? local.new_image_id : data.ibm_is_image.management[0].id - machine_type = data.ibm_is_instance_profile.management_node.name - prefix = format("%s-%s", local.management_node_name, count.index + 1) + image_id = local.login_image_found_in_map ? local.new_login_image_id : data.ibm_is_image.login_vsi_image[0].id + machine_type = var.login_instance[count.index]["profile"] + prefix = local.login_node_name resource_group_id = var.resource_group enable_floating_ip = false security_group_ids = module.compute_sg[*].security_group_id - ssh_key_ids = var.compute_ssh_keys - subnets = [local.compute_subnets[0]] + ssh_key_ids = local.ssh_keys + subnets = length(var.bastion_subnets) == 2 ? 
[var.bastion_subnets[1]] : [var.bastion_subnets[0]] tags = local.tags - user_data = "${data.template_file.management_user_data.rendered} ${file("${path.module}/templates/lsf_management.sh")}" + user_data = data.template_file.login_user_data.rendered vpc_id = var.vpc_id kms_encryption_enabled = var.kms_encryption_enabled skip_iam_authorization_policy = local.skip_iam_authorization_policy boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid + placement_group_id = var.placement_group_ids + #placement_group_id = var.placement_group_ids[(var.management_instances[count.index]["count"])%(length(var.placement_group_ids))] } -module "management_candidate_vsi" { - count = var.management_node_count - 1 +module "management_vsi" { + count = length(var.management_instances) source = "terraform-ibm-modules/landing-zone-vsi/ibm" version = "5.0.0" + vsi_per_subnet = var.management_instances[count.index]["count"] create_security_group = false security_group = null + image_id = local.image_mapping_entry_found ? local.new_image_id : data.ibm_is_image.management_stock_image[0].id + machine_type = var.management_instances[count.index]["profile"] + prefix = format("%s-%s", local.management_node_name, count.index + 1) + resource_group_id = var.resource_group + enable_floating_ip = false security_group_ids = module.compute_sg[*].security_group_id + ssh_key_ids = local.ssh_keys + subnets = local.cluster_subnet_id + tags = local.tags + user_data = data.template_file.management_user_data.rendered vpc_id = var.vpc_id - ssh_key_ids = var.compute_ssh_keys - subnets = [local.compute_subnets[0]] + kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = local.skip_iam_authorization_policy + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid + placement_group_id = var.placement_group_ids +} + +module "compute_vsi" { + count = length(var.static_compute_instances) + source = "terraform-ibm-modules/landing-zone-vsi/ibm" + version = "5.0.0" + vsi_per_subnet = var.static_compute_instances[count.index]["count"] + create_security_group = false + security_group = null + image_id = local.compute_image_found_in_map ? local.new_compute_image_id : data.ibm_is_image.compute_stock_image[0].id + machine_type = var.static_compute_instances[count.index]["profile"] + prefix = format("%s-%s", local.compute_node_name, count.index + 1) resource_group_id = var.resource_group enable_floating_ip = false - user_data = "${data.template_file.management_user_data.rendered} ${file("${path.module}/templates/lsf_management.sh")}" + security_group_ids = module.compute_sg[*].security_group_id + ssh_key_ids = local.ssh_keys + subnets = local.cluster_subnet_id + tags = local.tags + user_data = var.scheduler == "Scale" ? data.template_file.scale_compute_user_data.rendered : data.template_file.lsf_compute_user_data.rendered + vpc_id = var.vpc_id kms_encryption_enabled = var.kms_encryption_enabled skip_iam_authorization_policy = local.skip_iam_authorization_policy boot_volume_encryption_key = var.boot_volume_encryption_key - image_id = local.image_mapping_entry_found ? 
local.new_image_id : data.ibm_is_image.management[0].id - prefix = format("%s-%s", local.management_node_name, count.index + 2) - machine_type = data.ibm_is_instance_profile.management_node.name - vsi_per_subnet = 1 - tags = local.tags + existing_kms_instance_guid = var.existing_kms_instance_guid + placement_group_id = var.enable_dedicated_host ? null : var.placement_group_ids + enable_dedicated_host = var.enable_dedicated_host + dedicated_host_id = var.enable_dedicated_host && length(var.static_compute_instances) > 0 ? local.dedicated_host_map[var.static_compute_instances[count.index]["profile"]] : null + depends_on = [module.dedicated_host, null_resource.dedicated_host_validation] } -module "worker_vsi" { - count = length(local.flattened_worker_nodes) +module "compute_cluster_management_vsi" { + count = var.scheduler == "Scale" && local.enable_compute ? 1 : 0 source = "terraform-ibm-modules/landing-zone-vsi/ibm" version = "5.0.0" vsi_per_subnet = 1 create_security_group = false security_group = null - image_id = local.compute_image_found_in_map ? local.new_compute_image_id : data.ibm_is_image.compute[0].id - machine_type = local.flattened_worker_nodes[count.index].instance_type - prefix = format("%s-%s", local.worker_node_name, count.index) + image_id = data.ibm_is_image.compute_stock_image[0].id + machine_type = var.static_compute_instances[count.index]["profile"] + prefix = count.index == 0 ? local.cpmoute_management_node_name : format("%s-%s", local.cpmoute_management_node_name, count.index) resource_group_id = var.resource_group enable_floating_ip = false security_group_ids = module.compute_sg[*].security_group_id - ssh_key_ids = var.compute_ssh_keys - subnets = [local.compute_subnets[0]] + ssh_key_ids = local.ssh_keys + subnets = local.cluster_subnet_id tags = local.tags - user_data = "${data.template_file.worker_user_data.rendered} ${file("${path.module}/templates/static_worker_vsi.sh")}" + user_data = data.template_file.scale_compute_user_data.rendered vpc_id = var.vpc_id kms_encryption_enabled = var.kms_encryption_enabled skip_iam_authorization_policy = local.skip_iam_authorization_policy boot_volume_encryption_key = var.boot_volume_encryption_key - enable_dedicated_host = var.enable_dedicated_host - dedicated_host_id = var.dedicated_host_id - depends_on = [module.management_vsi, module.do_management_vsi_configuration] + existing_kms_instance_guid = var.existing_kms_instance_guid + placement_group_id = var.placement_group_ids } -module "login_vsi" { - # count = 1 +module "storage_vsi" { + count = length(var.storage_instances) > 0 && var.storage_type != "persistent" ? 1 : 0 + source = "terraform-ibm-modules/landing-zone-vsi/ibm" + version = "5.0.0" + vsi_per_subnet = var.storage_instances[count.index]["count"] + create_security_group = false + security_group = null + image_id = local.storage_image_id[count.index] + machine_type = var.storage_instances[count.index]["profile"] + prefix = count.index == 0 ? local.storage_node_name : format("%s-%s", local.storage_node_name, count.index) + resource_group_id = var.resource_group + enable_floating_ip = false + security_group_ids = module.storage_sg[*].security_group_id + ssh_key_ids = local.ssh_keys + subnets = local.storage_subnets + tags = local.tags + user_data = data.template_file.storage_user_data.rendered + vpc_id = var.vpc_id + block_storage_volumes = local.enable_block_storage ? 
local.block_storage_volumes : [] + kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = local.skip_iam_authorization_policy + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid + placement_group_id = var.placement_group_ids + secondary_allow_ip_spoofing = local.enable_protocol && var.colocate_protocol_instances ? true : false + secondary_security_groups = local.protocol_secondary_security_group + secondary_subnets = local.enable_protocol && var.colocate_protocol_instances ? local.protocol_subnets : [] + manage_reserved_ips = local.enable_protocol && var.colocate_protocol_instances ? true : false + primary_vni_additional_ip_count = local.enable_protocol && var.colocate_protocol_instances ? var.protocol_instances[count.index]["count"] : 0 + depends_on = [resource.null_resource.entitlement_check] + # manage_reserved_ips = true + # primary_vni_additional_ip_count = var.storage_instances[count.index]["count"] + # placement_group_id = var.placement_group_ids[(var.storage_instances[count.index]["count"])%(length(var.placement_group_ids))] +} + + +module "storage_cluster_management_vsi" { + count = length(var.storage_instances) source = "terraform-ibm-modules/landing-zone-vsi/ibm" version = "5.0.0" vsi_per_subnet = 1 create_security_group = false security_group = null - image_id = local.login_image_mapping_entry_found ? local.new_login_image_id : data.ibm_is_image.login[0].id - machine_type = var.login_node_instance_type - prefix = local.login_node_name + image_id = local.storage_image_id[count.index] + machine_type = var.management_instances[count.index]["profile"] + prefix = count.index == 0 ? local.storage_management_node_name : format("%s-%s", local.storage_management_node_name, count.index) resource_group_id = var.resource_group enable_floating_ip = false - security_group_ids = [var.bastion_security_group_id] - ssh_key_ids = var.bastion_ssh_keys - subnets = length(var.bastion_subnets) == 2 ? [local.bastion_subnets[1]] : [local.bastion_subnets[0]] + security_group_ids = module.storage_sg[*].security_group_id + ssh_key_ids = local.ssh_keys + subnets = local.storage_subnets tags = local.tags - user_data = "${data.template_file.login_user_data.rendered} ${file("${path.module}/templates/login_vsi.sh")}" + user_data = data.template_file.storage_user_data.rendered vpc_id = var.vpc_id + block_storage_volumes = local.enable_block_storage ? local.block_storage_volumes : [] kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = local.skip_iam_authorization_policy boot_volume_encryption_key = var.boot_volume_encryption_key - skip_iam_authorization_policy = var.bastion_instance_name != null ? false : local.skip_iam_authorization_policy existing_kms_instance_guid = var.existing_kms_instance_guid + placement_group_id = var.placement_group_ids + depends_on = [resource.null_resource.entitlement_check] + #placement_group_id = var.placement_group_ids[(var.storage_instances[count.index]["count"])%(length(var.placement_group_ids))] } -module "ldap_vsi" { - count = local.ldap_enable +module "storage_cluster_tie_breaker_vsi" { + count = var.storage_type != "persistent" ? 
1 : 0 source = "terraform-ibm-modules/landing-zone-vsi/ibm" version = "5.0.0" vsi_per_subnet = 1 create_security_group = false security_group = null - image_id = local.ldap_instance_image_id - machine_type = var.ldap_vsi_profile - prefix = local.ldap_node_name + image_id = local.storage_image_id[count.index] + machine_type = var.storage_instances[count.index]["profile"] + prefix = format("%s-strg-tie", local.prefix) resource_group_id = var.resource_group enable_floating_ip = false - security_group_ids = module.compute_sg[*].security_group_id - ssh_key_ids = var.compute_ssh_keys - subnets = [local.compute_subnets[0]] + security_group_ids = module.storage_sg[*].security_group_id + ssh_key_ids = local.ssh_keys + subnets = local.storage_subnets #[local.storage_subnets[0]] tags = local.tags - user_data = var.enable_ldap == true && var.ldap_server == "null" ? "${data.template_file.ldap_user_data[0].rendered} ${file("${path.module}/templates/ldap_user_data.sh")}" : "" + user_data = data.template_file.storage_user_data.rendered vpc_id = var.vpc_id + block_storage_volumes = local.enable_block_storage ? local.block_storage_volumes : [] kms_encryption_enabled = var.kms_encryption_enabled skip_iam_authorization_policy = local.skip_iam_authorization_policy boot_volume_encryption_key = var.boot_volume_encryption_key - #placement_group_id = var.placement_group_ids[(var.management_instances[count.index]["count"])%(length(var.placement_group_ids))] + existing_kms_instance_guid = var.existing_kms_instance_guid + placement_group_id = var.placement_group_ids + # manage_reserved_ips = true + # primary_vni_additional_ip_count = var.storage_instances[count.index]["count"] + # placement_group_id = var.placement_group_ids[(var.storage_instances[count.index]["count"])%(length(var.placement_group_ids))] } - -module "generate_db_password" { - count = var.enable_app_center && var.app_center_high_availability ? 1 : 0 - source = "../../modules/security/password" - length = 15 - special = true - override_special = "-_" - min_numeric = 1 +module "client_vsi" { + count = length(var.client_instances) + source = "terraform-ibm-modules/landing-zone-vsi/ibm" + version = "5.0.0" + vsi_per_subnet = var.client_instances[count.index]["count"] + create_security_group = false + security_group = null + image_id = local.client_image_id[count.index] + machine_type = var.client_instances[count.index]["profile"] + prefix = count.index == 0 ? local.client_node_name : format("%s-%s", local.client_node_name, count.index) + resource_group_id = var.resource_group + enable_floating_ip = false + security_group_ids = module.client_sg[*].security_group_id + ssh_key_ids = local.ssh_keys + subnets = local.client_subnets + tags = local.tags + user_data = data.template_file.client_user_data.rendered + vpc_id = var.vpc_id + kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = local.skip_iam_authorization_policy + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid + depends_on = [resource.null_resource.entitlement_check] } -module "ssh_key" { - source = "./../key" +module "protocol_vsi" { + count = var.colocate_protocol_instances == true ? 
0 : length(var.protocol_instances) + source = "terraform-ibm-modules/landing-zone-vsi/ibm" + version = "5.0.0" + vsi_per_subnet = var.protocol_instances[count.index]["count"] + create_security_group = false + security_group = null + image_id = local.protocol_image_id[count.index] + machine_type = var.protocol_instances[count.index]["profile"] + prefix = count.index == 0 ? local.protocol_node_name : format("%s-%s", local.protocol_node_name, count.index) + resource_group_id = var.resource_group + enable_floating_ip = false + security_group_ids = module.storage_sg[*].security_group_id + ssh_key_ids = local.ssh_keys + subnets = local.storage_subnets + tags = local.tags + user_data = data.template_file.protocol_user_data.rendered + vpc_id = var.vpc_id + kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = local.skip_iam_authorization_policy + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid + # Bug: 5847 - LB profile & subnets are not configurable + # load_balancers = local.enable_load_balancer ? local.load_balancers : [] + secondary_allow_ip_spoofing = true + secondary_security_groups = local.protocol_secondary_security_group + secondary_subnets = local.protocol_subnets + placement_group_id = var.placement_group_ids + manage_reserved_ips = true + primary_vni_additional_ip_count = var.protocol_instances[count.index]["count"] + depends_on = [resource.null_resource.entitlement_check] + # placement_group_id = var.placement_group_ids[(var.protocol_instances[count.index]["count"])%(length(var.placement_group_ids))] } -module "wait_management_vsi_booted" { - source = "./../../modules/null/remote_exec" - cluster_host = concat([local.management_private_ip]) - cluster_user = var.cluster_user #"root" - cluster_private_key = var.compute_private_key_content - login_host = var.bastion_fip - login_user = "ubuntu" - login_private_key = var.bastion_private_key_content - command = ["cloud-init status --wait;hostname;date;df;id"] - timeout = "15m" # let's be patient, the VSI may need time to boot completely - depends_on = [ - module.management_vsi - ] +module "afm_vsi" { + count = length(var.afm_instances) + source = "terraform-ibm-modules/landing-zone-vsi/ibm" + version = "5.0.0" + vsi_per_subnet = var.afm_instances[count.index]["count"] + create_security_group = false + security_group = null + image_id = local.afm_image_id[count.index] + machine_type = var.afm_instances[count.index]["profile"] + prefix = count.index == 0 ? 
local.afm_node_name : format("%s-%s", local.afm_node_name, count.index) + resource_group_id = var.resource_group + enable_floating_ip = false + security_group_ids = module.storage_sg[*].security_group_id + ssh_key_ids = local.ssh_keys + subnets = local.storage_subnets + tags = local.tags + user_data = data.template_file.afm_user_data.rendered + vpc_id = var.vpc_id + kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = local.skip_iam_authorization_policy + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid + # manage_reserved_ips = true + # primary_vni_additional_ip_count = var.afm_instances[count.index]["count"] } -module "wait_management_candidate_vsi_booted" { - source = "./../../modules/null/remote_exec" - cluster_host = concat(local.management_candidate_private_ips) - cluster_user = var.cluster_user #"root" - cluster_private_key = var.compute_private_key_content - login_host = var.bastion_fip - login_user = "ubuntu" - login_private_key = var.bastion_private_key_content - command = ["cloud-init status --wait;hostname;date;df;id"] - timeout = "15m" # let's be patient, the VSI may need time to boot completely - depends_on = [ - module.management_candidate_vsi - ] +module "gklm_vsi" { + count = var.scale_encryption_enabled == true && var.scale_encryption_type == "gklm" ? 1 : 0 + source = "terraform-ibm-modules/landing-zone-vsi/ibm" + version = "5.0.0" + vsi_per_subnet = var.gklm_instances[count.index]["count"] + create_security_group = false + security_group = null + image_id = local.gklm_image_id[count.index] + machine_type = var.gklm_instances[count.index]["profile"] + prefix = count.index == 0 ? local.gklm_node_name : format("%s-%s", local.gklm_node_name, count.index) + resource_group_id = var.resource_group + enable_floating_ip = false + security_group_ids = module.storage_sg[*].security_group_id + ssh_key_ids = local.gklm_ssh_keys + subnets = local.storage_subnets + tags = local.tags + user_data = data.template_file.gklm_user_data.rendered + vpc_id = var.vpc_id + kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = local.skip_iam_authorization_policy + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid } -module "wait_worker_vsi_booted" { - count = var.solution == "lsf" ? 1 : 0 - source = "./../../modules/null/remote_exec" - cluster_host = concat(local.worker_private_ip) - cluster_user = var.cluster_user #"root" - cluster_private_key = var.compute_private_key_content - login_host = var.bastion_fip - login_user = "ubuntu" - login_private_key = var.bastion_private_key_content - command = ["cloud-init status --wait;hostname;date;df;id"] - timeout = "15m" # let's be patient, the VSI may need time to boot completely - depends_on = [ - module.management_candidate_vsi, - module.wait_management_vsi_booted - ] +module "ldap_vsi" { + count = var.enable_ldap == true && var.ldap_server == "null" ? 1 : 0 + source = "terraform-ibm-modules/landing-zone-vsi/ibm" + version = "5.0.0" + vsi_per_subnet = 1 + create_security_group = false + security_group = null + image_id = local.ldap_image_id[count.index] + machine_type = var.ldap_instances[count.index]["profile"] + prefix = local.ldap_node_name + resource_group_id = var.resource_group + enable_floating_ip = false + security_group_ids = local.products == "lsf" ? 
module.compute_sg[*].security_group_id : module.storage_sg[*].security_group_id + ssh_key_ids = local.products == "lsf" ? local.ssh_keys : local.ldap_ssh_keys + subnets = local.products == "lsf" ? local.cluster_subnet_id : [local.storage_subnets[0]] + tags = local.tags + user_data = data.template_file.ldap_user_data.rendered + vpc_id = var.vpc_id + block_storage_volumes = local.enable_block_storage ? local.block_storage_volumes : [] + kms_encryption_enabled = var.kms_encryption_enabled + skip_iam_authorization_policy = local.skip_iam_authorization_policy + boot_volume_encryption_key = var.boot_volume_encryption_key + existing_kms_instance_guid = var.existing_kms_instance_guid + placement_group_id = var.placement_group_ids + #placement_group_id = var.placement_group_ids[(var.storage_instances[count.index]["count"])%(length(var.placement_group_ids))] } -module "do_management_vsi_configuration" { - source = "./../../modules/null/remote_exec_script" - cluster_host = concat([local.management_private_ip]) - cluster_user = var.cluster_user #"root" - cluster_private_key = var.compute_private_key_content - login_host = var.bastion_fip - login_user = "ubuntu" - login_private_key = var.bastion_private_key_content - payload_files = ["${path.module}/configuration_steps/configure_management_vsi.sh", "${path.module}/configuration_steps/compute_user_data_fragment.sh"] - payload_dirs = [] - new_file_name = "management_values" - new_file_content = data.template_file.management_values.rendered - script_to_run = "configure_management_vsi.sh" - sudo_user = "root" - with_bash = true - depends_on = [ - module.wait_management_vsi_booted - ] - trigger_string = join(",", module.management_vsi[0].ids) +######################################################################## +### Dedicated Host ### +######################################################################## +module "dedicated_host" { + for_each = var.enable_dedicated_host ? local.dedicated_host_config : {} + source = "../dedicated_host" + prefix = var.prefix + zone = var.zones + existing_host_group = false + class = each.value.class + profile = each.value.profile + family = each.value.family + resource_group_id = var.resource_group + depends_on = [null_resource.dedicated_host_validation] } -module "do_management_candidate_vsi_configuration" { - source = "./../../modules/null/remote_exec_script" - cluster_host = concat(local.management_candidate_private_ips) - cluster_user = var.cluster_user #"root" - cluster_private_key = var.compute_private_key_content - login_host = var.bastion_fip - login_user = "ubuntu" - login_private_key = var.bastion_private_key_content - payload_files = ["${path.module}/configuration_steps/configure_management_vsi.sh"] - payload_dirs = [] - new_file_name = "management_values" - new_file_content = data.template_file.management_values.rendered - script_to_run = "configure_management_vsi.sh" - sudo_user = "root" - with_bash = true - depends_on = [ - module.wait_management_candidate_vsi_booted - ] - trigger_string = join(",", flatten(module.management_candidate_vsi[*].ids)) +######################################################################## +### Baremetal Module ### +######################################################################## + +module "storage_baremetal" { + + count = length(var.storage_servers) > 0 && var.storage_type == "persistent" ? 
1 : 0 + source = "../baremetal" + existing_resource_group = var.resource_group + prefix = var.prefix + storage_subnets = [for subnet in local.storage_subnets : subnet.id] + storage_ssh_keys = local.ssh_keys + storage_servers = var.storage_servers + security_group_ids = module.storage_sg[*].security_group_id + bastion_public_key_content = var.bastion_public_key_content } diff --git a/modules/landing_zone_vsi/outputs.tf b/modules/landing_zone_vsi/outputs.tf index f53c0f49..c8d3523d 100644 --- a/modules/landing_zone_vsi/outputs.tf +++ b/modules/landing_zone_vsi/outputs.tf @@ -1,11 +1,21 @@ +output "client_vsi_data" { + description = "client VSI data" + value = module.client_vsi[*]["list"] +} + output "management_vsi_data" { description = "Management VSI data" value = module.management_vsi[*]["list"] } -output "management_candidate_vsi_data" { - description = "Management candidate VSI data" - value = module.management_candidate_vsi[*]["list"] +output "compute_vsi_data" { + description = "Compute VSI data" + value = module.compute_vsi[*]["list"] +} + +output "compute_management_vsi_data" { + description = "Compute Management VSI data" + value = module.compute_cluster_management_vsi[*]["list"] } output "login_vsi_data" { @@ -13,24 +23,24 @@ output "login_vsi_data" { value = module.login_vsi[*]["list"] } -output "ldap_vsi_data" { - description = "Login VSI data" - value = module.ldap_vsi[*]["list"] +output "storage_vsi_data" { + description = "Storage VSI data" + value = module.storage_vsi[*]["list"] } -output "worker_vsi_data" { - description = "Static worker VSI data" - value = module.worker_vsi[*]["list"] +output "storage_bms_data" { + description = "Storage BareMetal Server data" + value = flatten(module.storage_baremetal[*].list) } -output "image_map_entry_found" { - description = "Available if the image name provided is located within the image map" - value = "${local.image_mapping_entry_found} -- - ${var.management_image_name}" +output "storage_cluster_management_vsi" { + description = "Storage Management VSI data" + value = module.storage_cluster_management_vsi[*]["list"] } -output "ldap_server" { - description = "LDAP server IP" - value = local.ldap_server +output "protocol_vsi_data" { + description = "Protocol VSI data" + value = module.protocol_vsi[*]["list"] } output "compute_sg_id" { @@ -40,12 +50,56 @@ output "compute_sg_id" { output "compute_public_key_content" { description = "Compute public key content" - value = one(module.compute_key[*].private_key_content) sensitive = true + value = one(module.compute_key[*].public_key_content) } output "compute_private_key_content" { description = "Compute private key content" + sensitive = true value = one(module.compute_key[*].private_key_content) +} + +output "afm_vsi_data" { + description = "AFM VSI data" + value = module.afm_vsi[*]["list"] +} + +output "gklm_vsi_data" { + description = "GKLM VSI data" + value = module.gklm_vsi[*]["list"] +} + +output "ldap_vsi_data" { + description = "LDAP VSI data" + value = module.ldap_vsi[*]["list"] +} + +output "storage_cluster_tie_breaker_vsi_data" { + description = "Storage Cluster Tie Breaker VSI data" + value = module.storage_cluster_tie_breaker_vsi[*]["list"] +} + +output "instance_ips_with_vol_mapping" { + description = "Storage instance ips with vol mapping" + value = try({ for instance_details in flatten([for name_details in(flatten(module.storage_vsi[*]["list"])[*]["name"]) : name_details]) : instance_details => + data.ibm_is_instance_profile.storage[0].disks[0].quantity[0].value == 1 ? 
["/dev/vdb"] : ["/dev/vdb", "/dev/vdc"] }, {}) +} + +output "instance_ips_with_vol_mapping_tie_breaker" { + description = "Tie breaker instance ips with vol mapping" + value = try({ for instance_details in flatten([for name_details in(flatten(module.storage_cluster_tie_breaker_vsi[*]["list"])[*]["name"]) : name_details]) : instance_details => + data.ibm_is_instance_profile.storage_tie_instance[0].disks[0].quantity[0].value == 1 ? ["/dev/vdb"] : ["/dev/vdb", "/dev/vdc"] }, {}) +} + +output "storage_private_key_content" { + description = "Storage private key content" + value = try(module.storage_key[0].private_key_content, "") + sensitive = true +} + +output "storage_public_key_content" { + description = "Storage public key content" + value = try(module.storage_key[0].public_key_content, "") sensitive = true } diff --git a/modules/landing_zone_vsi/template_files.tf b/modules/landing_zone_vsi/template_files.tf index 18417a10..6b1cc54f 100644 --- a/modules/landing_zone_vsi/template_files.tf +++ b/modules/landing_zone_vsi/template_files.tf @@ -1,156 +1,121 @@ +data "template_file" "ldap_user_data" { + template = file("${path.module}/templates/ldap_user_data.tpl") + vars = { + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + compute_public_key_content = local.enable_compute ? local.compute_public_key_content != null ? local.compute_public_key_content : "" : "" + compute_private_key_content = local.enable_compute ? local.compute_private_key_content != null ? local.compute_private_key_content : "" : "" + compute_interfaces = var.storage_type == "scratch" ? local.vsi_interfaces[0] : local.bms_interfaces[0] + compute_dns_domain = var.dns_domain_names["compute"] + } +} + +data "template_file" "client_user_data" { + template = file("${path.module}/templates/client_user_data.tpl") + vars = { + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + client_public_key_content = local.enable_client ? local.compute_public_key_content != null ? local.compute_public_key_content : "" : "" + client_private_key_content = local.enable_client ? local.compute_private_key_content != null ? local.compute_private_key_content : "" : "" + client_interfaces = var.storage_type == "scratch" ? local.vsi_interfaces[0] : local.bms_interfaces[0] + client_dns_domain = var.dns_domain_names["compute"] + } +} + data "template_file" "management_user_data" { template = file("${path.module}/templates/management_user_data.tpl") vars = { - management_node_count = var.management_node_count - rc_cidr_block = local.compute_subnets[0].cidr - cluster_prefix = var.prefix - cluster_private_key_content = local.enable_management ? module.compute_key[0].private_key_content : "" - cluster_public_key_content = local.enable_management ? module.compute_key[0].public_key_content : "" - hyperthreading = var.hyperthreading_enabled - network_interface = local.vsi_interfaces[0] - dns_domain = var.dns_domain_names["compute"] - mount_path = var.share_path - enable_ldap = var.enable_ldap - ldap_server_ip = local.ldap_server - ldap_server_cert = local.ldap_server_cert - ldap_basedns = var.enable_ldap == true ? var.ldap_basedns : "null" - login_ip_address = var.login_private_ips + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + management_public_key_content = local.enable_management ? local.compute_public_key_content != null ? 
local.compute_public_key_content : "" : "" + management_private_key_content = local.enable_management ? local.compute_private_key_content != null ? local.compute_private_key_content : "" : "" + management_interfaces = var.storage_type == "scratch" ? local.vsi_interfaces[0] : local.bms_interfaces[0] + management_dns_domain = var.dns_domain_names["compute"] + } +} + +data "template_file" "lsf_compute_user_data" { + template = file("${path.module}/templates/lsf_compute_user_data.tpl") + vars = { + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + management_public_key_content = local.enable_compute ? local.compute_public_key_content != null ? local.compute_public_key_content : "" : "" + management_private_key_content = local.enable_compute ? local.compute_private_key_content != null ? local.compute_private_key_content : "" : "" + management_interfaces = var.storage_type == "scratch" ? local.vsi_interfaces[0] : local.bms_interfaces[0] + management_dns_domain = var.dns_domain_names["compute"] + # TODO: Fix me + dynamic_compute_instances = var.dynamic_compute_instances == null ? "" : "" } } data "template_file" "login_user_data" { template = file("${path.module}/templates/login_user_data.tpl") vars = { - network_interface = local.vsi_interfaces[0] - dns_domain = var.dns_domain_names["compute"] - cluster_private_key_content = local.enable_management ? module.compute_key[0].private_key_content : "" - cluster_public_key_content = local.enable_management ? module.compute_key[0].public_key_content : "" - mount_path = var.share_path - custom_mount_paths = join(" ", concat(local.vpc_file_share[*]["mount_path"], local.nfs_file_share[*]["mount_path"])) - custom_file_shares = join(" ", concat([for file_share in var.file_share : file_share], local.nfs_file_share[*]["nfs_share"])) - enable_ldap = var.enable_ldap - rc_cidr_block = local.bastion_subnets[0].cidr - cluster_prefix = var.prefix - rc_cidr_block_1 = local.compute_subnets[0].cidr - hyperthreading = var.hyperthreading_enabled - ldap_server_ip = local.ldap_server - ldap_basedns = var.enable_ldap == true ? var.ldap_basedns : "null" + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + login_public_key_content = local.enable_compute ? local.compute_public_key_content != null ? local.compute_public_key_content : "" : "" + login_private_key_content = local.enable_compute ? local.compute_private_key_content != null ? local.compute_private_key_content : "" : "" + login_interfaces = var.storage_type == "scratch" ? local.vsi_interfaces[0] : local.bms_interfaces[0] + login_dns_domain = var.dns_domain_names["compute"] + scheduler = var.scheduler } } -data "template_file" "ldap_user_data" { - count = var.enable_ldap == true ? 1 : 0 - template = file("${path.module}/templates/ldap_user_data.tpl") +data "template_file" "scale_compute_user_data" { + template = file("${path.module}/templates/scale_compute_user_data.tpl") + vars = { + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + compute_public_key_content = local.enable_compute ? local.compute_public_key_content != null ? local.compute_public_key_content : "" : "" + compute_private_key_content = local.enable_compute ? local.compute_private_key_content != null ? local.compute_private_key_content : "" : "" + compute_interfaces = var.storage_type == "scratch" ? 
local.vsi_interfaces[0] : local.bms_interfaces[0] + compute_dns_domain = var.dns_domain_names["compute"] + } +} + +data "template_file" "storage_user_data" { + template = file("${path.module}/templates/storage_user_data.tpl") + vars = { + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + storage_public_key_content = local.enable_storage ? module.storage_key[0].public_key_content : "" + storage_private_key_content = local.enable_storage ? module.storage_key[0].private_key_content : "" + storage_interfaces = var.storage_type == "scratch" ? local.vsi_interfaces[0] : local.bms_interfaces[0] + storage_dns_domain = local.enable_storage ? var.dns_domain_names["storage"] : "" + storage_disk_type = var.storage_type == "scratch" ? data.ibm_is_instance_profile.storage[0].disks[0].quantity[0].type : "" + protocol_dns_domain = local.enable_protocol ? var.dns_domain_names["protocol"] : "" + enable_protocol = local.enable_protocol + vpc_region = var.vpc_region + resource_group_id = var.resource_group + protocol_subnets = local.enable_protocol ? local.protocol_subnets[0].id : "" + } +} + +data "template_file" "protocol_user_data" { + template = file("${path.module}/templates/protocol_user_data.tpl") vars = { - ssh_public_key_content = local.enable_management ? module.compute_key[0].public_key_content : "" - ldap_basedns = var.ldap_basedns - ldap_admin_password = var.ldap_admin_password - cluster_prefix = var.prefix - ldap_user = var.ldap_user_name - ldap_user_password = var.ldap_user_password - mount_path = var.share_path - dns_domain = var.dns_domain_names["compute"] + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + storage_public_key_content = local.enable_protocol ? module.storage_key[0].public_key_content : "" + storage_private_key_content = local.enable_protocol ? module.storage_key[0].private_key_content : "" + storage_interfaces = var.storage_type == "scratch" ? local.vsi_interfaces[0] : local.bms_interfaces[0] + protocol_interfaces = var.storage_type == "scratch" ? local.vsi_interfaces[1] : local.bms_interfaces[1] + storage_dns_domain = local.enable_storage ? var.dns_domain_names["storage"] : "" + protocol_dns_domain = local.enable_protocol ? var.dns_domain_names["protocol"] : "" + vpc_region = var.vpc_region + resource_group_id = var.resource_group + protocol_subnets = local.enable_protocol ? local.protocol_subnets[0].id : "" } } -data "template_file" "worker_user_data" { - template = file("${path.module}/templates/static_worker_user_data.tpl") +data "template_file" "afm_user_data" { + template = file("${path.module}/templates/afm_user_data.tpl") vars = { - network_interface = local.vsi_interfaces[0] - dns_domain = var.dns_domain_names["compute"] - cluster_private_key_content = local.enable_management ? module.compute_key[0].private_key_content : "" - cluster_public_key_content = local.enable_management ? module.compute_key[0].public_key_content : "" - mount_path = var.share_path - custom_mount_paths = join(" ", concat(local.vpc_file_share[*]["mount_path"], local.nfs_file_share[*]["mount_path"])) - custom_file_shares = join(" ", concat([for file_share in var.file_share : file_share], local.nfs_file_share[*]["nfs_share"])) - enable_ldap = var.enable_ldap - rc_cidr_block = local.compute_subnets[0].cidr - cluster_prefix = var.prefix - hyperthreading = var.hyperthreading_enabled - ldap_server_ip = local.ldap_server - ldap_basedns = var.enable_ldap == true ? 
var.ldap_basedns : "null" - cluster_name = var.cluster_id - management_hostname = local.management_hostname - observability_monitoring_enable = var.observability_monitoring_enable - observability_monitoring_on_compute_nodes_enable = var.observability_monitoring_on_compute_nodes_enable - cloud_monitoring_access_key = var.cloud_monitoring_access_key - cloud_monitoring_ingestion_url = var.cloud_monitoring_ingestion_url - cloud_logs_ingress_private_endpoint = var.cloud_logs_ingress_private_endpoint - observability_logs_enable_for_compute = var.observability_logs_enable_for_compute - VPC_APIKEY_VALUE = var.ibmcloud_api_key + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + storage_public_key_content = local.enable_storage ? module.storage_key[0].public_key_content : "" + storage_private_key_content = local.enable_storage ? module.storage_key[0].private_key_content : "" + storage_interfaces = var.storage_type == "scratch" ? local.vsi_interfaces[0] : local.bms_interfaces[0] + storage_dns_domain = local.enable_storage ? var.dns_domain_names["storage"] : "" } } -data "template_file" "management_values" { - template = file("${path.module}/configuration_steps/management_values.tpl") +data "template_file" "gklm_user_data" { + template = file("${path.module}/templates/gklm_user_data.tpl") vars = { - bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" - vpc_apikey_value = var.ibmcloud_api_key - resource_records_apikey_value = var.ibmcloud_api_key - management_node_count = var.management_node_count - api_endpoint_us_east = local.us_east - api_endpoint_eu_de = local.eu_de - api_endpoint_us_south = local.us_south - image_id = local.compute_image_from_data ? data.ibm_is_image.compute[0].id : local.new_compute_image_id - subnet_id = local.compute_subnets[0].crn - security_group_id = module.compute_sg[0].security_group_id - sshkey_id = join(",", var.compute_ssh_keys) - region_name = data.ibm_is_region.region.name - zone_name = var.zones[0] - vpc_id = var.vpc_id - rc_cidr_block = local.compute_subnets[0].cidr - rc_max_num = local.rc_max_num - rc_rg = var.resource_group - cluster_name = var.cluster_id - ce_project_guid = var.ce_project_guid - cluster_prefix = var.prefix - cluster_private_key_content = local.enable_management ? module.compute_key[0].private_key_content : "" - cluster_public_key_content = local.enable_management ? module.compute_key[0].public_key_content : "" - hyperthreading = var.hyperthreading_enabled - network_interface = local.vsi_interfaces[0] - dns_domain = var.dns_domain_names["compute"] - mount_path = var.share_path - custom_mount_paths = join(" ", concat(local.vpc_file_share[*]["mount_path"], local.nfs_file_share[*]["mount_path"])) - custom_file_shares = join(" ", concat([for file_share in var.file_share : file_share], local.nfs_file_share[*]["nfs_share"])) - contract_id = var.solution == "hpc" ? var.contract_id : "" - enable_app_center = var.enable_app_center - app_center_gui_pwd = var.app_center_gui_pwd - enable_ldap = var.enable_ldap - ldap_server_ip = local.ldap_server - ldap_server_cert = local.ldap_server_cert - ldap_server_hostname = length(local.ldap_hostnames) > 0 ? local.ldap_hostnames[0] : "null" - ldap_basedns = var.enable_ldap == true ? var.ldap_basedns : "null" - bootdrive_crn = var.boot_volume_encryption_key == null ? 
"" : var.boot_volume_encryption_key - management_ip = local.management_private_ip - management_hostname = local.management_hostname - management_cand_ips = join(",", local.management_candidate_private_ips) - management_cand_hostnames = join(",", local.management_candidate_hostnames) - login_ip = local.login_private_ips[0] - login_hostname = local.login_hostnames[0] - # PAC High Availability - app_center_high_availability = var.app_center_high_availability - db_adminuser = var.enable_app_center && var.app_center_high_availability ? var.db_instance_info.admin_user : "" - db_adminpassword = var.enable_app_center && var.app_center_high_availability ? var.db_admin_password : "" - db_hostname = var.enable_app_center && var.app_center_high_availability ? var.db_instance_info.hostname : "" - db_port = var.enable_app_center && var.app_center_high_availability ? var.db_instance_info.port : "" - db_certificate = var.enable_app_center && var.app_center_high_availability ? var.db_instance_info.certificate : "" - db_name = var.enable_app_center && var.app_center_high_availability ? local.db_name : "" - db_user = var.enable_app_center && var.app_center_high_availability ? local.db_user : "" - db_password = var.enable_app_center && var.app_center_high_availability ? module.generate_db_password[0].password : "" - # Observability - observability_monitoring_enable = var.observability_monitoring_enable - observability_monitoring_on_compute_nodes_enable = var.observability_monitoring_on_compute_nodes_enable - cloud_monitoring_access_key = var.cloud_monitoring_access_key - cloud_monitoring_ingestion_url = var.cloud_monitoring_ingestion_url - cloud_monitoring_prws_key = var.cloud_monitoring_prws_key - cloud_monitoring_prws_url = var.cloud_monitoring_prws_url - cloud_logs_ingress_private_endpoint = var.cloud_logs_ingress_private_endpoint - observability_logs_enable_for_management = var.observability_logs_enable_for_management - observability_logs_enable_for_compute = var.observability_logs_enable_for_compute - solution = var.solution - rc_ncores = local.ncores - rc_ncpus = local.ncpus - rc_mem_in_mb = local.mem_in_mb - rc_profile = local.rc_profile + bastion_public_key_content = var.bastion_public_key_content != null ? var.bastion_public_key_content : "" + storage_public_key_content = local.enable_storage ? module.storage_key[0].public_key_content : "" + storage_private_key_content = local.enable_storage ? module.storage_key[0].private_key_content : "" } } diff --git a/modules/landing_zone_vsi/templates/afm_user_data.tpl b/modules/landing_zone_vsi/templates/afm_user_data.tpl new file mode 100644 index 00000000..c2f936af --- /dev/null +++ b/modules/landing_zone_vsi/templates/afm_user_data.tpl @@ -0,0 +1,110 @@ +#!/usr/bin/bash + +################################################### +# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
+# Licensed under the Apache License v2.0 +################################################### + +#!/usr/bin/env bash +exec > >(tee /var/log/ibm_spectrumscale_user-data.log) + +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys + +# input parameters +echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys +echo "${storage_public_key_content}" >> ~/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> ~/.ssh/config +echo "${storage_private_key_content}" > ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa + +# if grep -q "Red Hat" /etc/os-release +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser + REQ_PKG_INSTALLED=0 + if grep -q "platform:el9" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables-nft nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + elif grep -q "platform:el8" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python38 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl jq make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + else + PACKAGE_MGR=yum + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel yum-plugin-versionlock" + fi + + RETRY_LIMIT=5 + retry_count=0 + all_pkg_installed=1 + + while [[ $all_pkg_installed -ne 0 && $retry_count -lt $RETRY_LIMIT ]] + do + # Install all required packages + echo "INFO: Attempting to install packages" + $PACKAGE_MGR install -y $package_list + + # Check to ensure packages are installed + pkg_installed=0 + for pkg in $package_list + do + pkg_query=$($PACKAGE_MGR list installed $pkg) + pkg_installed=$(($? + $pkg_installed)) + done + if [[ $pkg_installed -ne 0 ]] + then + # The minimum required packages have not been installed. + echo "WARN: Required packages not installed. Sleeping for 60 seconds and retrying..."
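# Note on the check above: `dnf`/`yum list installed <pkg>` exits non-zero when <pkg> is absent,
# and the script sums each exit status into pkg_installed, so a single missing package is enough
# to trigger the metadata clean-up below and another pass (up to RETRY_LIMIT attempts). For example:
#   dnf list installed jq >/dev/null 2>&1 || echo "jq missing - another install pass is needed"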
+ touch /var/log/scale-rerun-package-install + echo "INFO: Cleaning and repopulating repository data" + $PACKAGE_MGR clean all + $PACKAGE_MGR makecache + sleep 60 + else + all_pkg_installed=0 + fi + retry_count=$(( $retry_count+1 )) + done + +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi + +yum update --security -y +yum versionlock $package_list +yum versionlock list +echo 'export PATH=$PATH:/usr/lpp/mmfs/bin' >> /root/.bashrc + +echo "DOMAIN=${storage_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${storage_interfaces}" +echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${storage_interfaces}" +chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +sleep 120 +systemctl restart NetworkManager + +systemctl stop firewalld +firewall-offline-cmd --zone=public --add-port=1191/tcp +firewall-offline-cmd --zone=public --add-port=4444/tcp +firewall-offline-cmd --zone=public --add-port=4444/udp +firewall-offline-cmd --zone=public --add-port=4739/udp +firewall-offline-cmd --zone=public --add-port=4739/tcp +firewall-offline-cmd --zone=public --add-port=9084/tcp +firewall-offline-cmd --zone=public --add-port=9085/tcp +firewall-offline-cmd --zone=public --add-service=http +firewall-offline-cmd --zone=public --add-service=https +firewall-offline-cmd --zone=public --add-port=2049/tcp +firewall-offline-cmd --zone=public --add-port=2049/udp +firewall-offline-cmd --zone=public --add-port=111/tcp +firewall-offline-cmd --zone=public --add-port=111/udp +firewall-offline-cmd --zone=public --add-port=30000-61000/tcp +firewall-offline-cmd --zone=public --add-port=30000-61000/udp +systemctl start firewalld +systemctl enable firewalld diff --git a/modules/landing_zone_vsi/templates/client_user_data.tpl b/modules/landing_zone_vsi/templates/client_user_data.tpl new file mode 100644 index 00000000..24abf3d3 --- /dev/null +++ b/modules/landing_zone_vsi/templates/client_user_data.tpl @@ -0,0 +1,90 @@ +#!/usr/bin/bash + +################################################### +# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
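Like the AFM template above, the client template below selects its package manager and package list from /etc/os-release: dnf with an el9 or el8 list on newer RHEL releases, plain yum plus yum-plugin-versionlock otherwise. Condensed sketch of that branch (package lists shortened here for readability):

if grep -q "platform:el9" /etc/os-release; then
  PACKAGE_MGR=dnf
  package_list="python3 kernel-devel-$(uname -r) firewalld nfs-utils python3-dnf-plugin-versionlock"
elif grep -q "platform:el8" /etc/os-release; then
  PACKAGE_MGR=dnf
  package_list="python38 kernel-devel-$(uname -r) firewalld nfs-utils python3-dnf-plugin-versionlock"
else
  PACKAGE_MGR=yum
  package_list="python3 kernel-devel-$(uname -r) firewalld nfs-utils yum-plugin-versionlock"
fi
$PACKAGE_MGR install -y $package_list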
+# Licensed under the Apache License v2.0 +################################################### + +#!/usr/bin/env bash + +exec > >(tee /var/log/ibm_spectrumscale_user-data.log) + +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi + +if grep -q "Red Hat" /etc/os-release +then + USER=vpcuser + REQ_PKG_INSTALLED=0 + if grep -q "platform:el9" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables-nft nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + elif grep -q "platform:el8" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python38 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl jq make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + else + PACKAGE_MGR=yum + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) rsync firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel yum-plugin-versionlock" + fi + + RETRY_LIMIT=5 + retry_count=0 + all_pkg_installed=1 + + while [[ $all_pkg_installed -ne 0 && $retry_count -lt $RETRY_LIMIT ]] + do + # Install all required packages + echo "INFO: Attempting to install packages" + $PACKAGE_MGR install -y $package_list + + # Check to ensure packages are installed + pkg_installed=0 + for pkg in $package_list + do + pkg_query=$($PACKAGE_MGR list installed $pkg) + pkg_installed=$(($? + $pkg_installed)) + done + if [[ $pkg_installed -ne 0 ]] + then + # The minimum required packages have not been installed. + echo "WARN: Required packages not installed. Sleeping for 60 seconds and retrying..." + touch /var/log/scale-rerun-package-install + echo "INFO: Cleaning and repopulating repository data" + $PACKAGE_MGR clean all + $PACKAGE_MGR makecache + sleep 60 + else + all_pkg_installed=0 + fi + retry_count=$(( $retry_count+1 )) + done + +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi + +yum update --security -y +yum versionlock add $package_list +yum versionlock list +echo 'export PATH=$PATH:/usr/lpp/mmfs/bin' >> /root/.bashrc + +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 10; exit 142\" /" ~/.ssh/authorized_keys +echo "${bastion_public_key_content}" >> /~/.ssh/authorized_keys +echo "${client_public_key_content}" >> ~/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> ~/.ssh/config +echo "${client_private_key_content}" > ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa + +echo "DOMAIN=${client_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${client_interfaces}" +echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${client_interfaces}" +chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +sleep 120 +systemctl restart NetworkManager diff --git a/modules/landing_zone_vsi/templates/compute_user_data.tpl b/modules/landing_zone_vsi/templates/compute_user_data.tpl deleted file mode 100644 index 94b6d59a..00000000 --- a/modules/landing_zone_vsi/templates/compute_user_data.tpl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/bash - -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
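The removed compute template below and the new templates above configure the primary NIC the same way: append the cluster DNS search domain and a 9000-byte MTU to the interface's ifcfg file, then restart NetworkManager so the change takes effect. Reduced to its essentials (interface and domain values are illustrative stand-ins for the template variables):

iface="eth0"                  # ${compute_interfaces} in the template
domain="comp.example.com"     # ${compute_dns_domain} in the template
cfg="/etc/sysconfig/network-scripts/ifcfg-${iface}"

echo "DOMAIN=${domain}" >> "$cfg"
echo "MTU=9000" >> "$cfg"
systemctl restart NetworkManager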
-# Licensed under the Apache License v2.0 -################################################### - -#!/usr/bin/env bash -if grep -E -q "CentOS|Red Hat" /etc/os-release -then - USER=vpcuser -elif grep -q "Ubuntu" /etc/os-release -then - USER=ubuntu -fi -sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys - -# input parameters -echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys -echo "${compute_public_key_content}" >> ~/.ssh/authorized_keys -echo "StrictHostKeyChecking no" >> ~/.ssh/config -echo "${compute_private_key_content}" > ~/.ssh/id_rsa -chmod 600 ~/.ssh/id_rsa - -# network setup -echo "DOMAIN=${compute_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${compute_interfaces}" -echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${compute_interfaces}" -chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser -systemctl restart NetworkManager diff --git a/modules/landing_zone_vsi/templates/gklm_user_data.tpl b/modules/landing_zone_vsi/templates/gklm_user_data.tpl new file mode 100644 index 00000000..cb14c0eb --- /dev/null +++ b/modules/landing_zone_vsi/templates/gklm_user_data.tpl @@ -0,0 +1,17 @@ +################################################### +# Copyright (C) IBM Corp. 2023 All Rights Reserved. +# Licensed under the Apache License v2.0 +################################################### + +#!/bin/bash +echo "0 $(hostname) 0" > /home/klmdb42/sqllib/db2nodes.cfg +systemctl start db2c_klmdb42.service +sleep 10 +systemctl status db2c_klmdb42.service +sleep 10 +#Copying SSH for passwordless authentication +echo "${storage_private_key_content}" > ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa +echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> ~/.ssh/config +reboot diff --git a/modules/landing_zone_vsi/templates/ldap_user_data.sh b/modules/landing_zone_vsi/templates/ldap_user_data.sh deleted file mode 100644 index 4a1f87f8..00000000 --- a/modules/landing_zone_vsi/templates/ldap_user_data.sh +++ /dev/null @@ -1,301 +0,0 @@ -#!/bin/bash -# shellcheck disable=all - -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
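The removed LDAP bootstrap below (and the removed login_vsi.sh further down) both gate their work on the same mount-with-retries helper for the shared LSF filesystem. Stripped to its core, and relying on mount's exit status rather than re-grepping the mount table, the pattern is:

mount_nfs_with_retries() {
  local server_path=$1 client_path=$2 retries=5
  mkdir -p "$client_path"
  for ((j = 0; j < retries; j++)); do
    mount -t nfs -o rw,hard "$server_path" "$client_path" && return 0
    echo "Attempt $((j + 1)) of $retries failed for $server_path" >&2
    sleep 2
  done
  return 1
}

# Example invocation; the real server path comes from the mount_path template variable.
mount_nfs_with_retries "10.10.10.10:/mnt/lsf" "/mnt/lsf" || exit 1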
-# Licensed under the Apache License v2.0 -################################################### - -USER=ubuntu -BASE_DN="${ldap_basedns}" -LDAP_DIR="/opt" -LDAP_ADMIN_PASSWORD="${ldap_admin_password}" -LDAP_GROUP="${cluster_prefix}" -LDAP_USER="${ldap_user}" -LDAP_USER_PASSWORD="${ldap_user_password}" -nfs_server_with_mount_path=${mount_path} -logfile="/tmp/user_data.log" - -sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys - -#input parameters -ssh_public_key_content="${ssh_public_key_content}" -echo "${ssh_public_key_content}" >> home/$USER/.ssh/authorized_keys -echo "StrictHostKeyChecking no" >> /home/$USER/.ssh/config - -# Installing Required softwares -apt-get update -y -apt-get install gnutls-bin ssl-cert nfs-common -y - -# Setup Network configuration -# Change the MTU setting as this is required for setting mtu as 9000 for communication to happen between clusters -if grep -q "NAME=\"Ubuntu\"" /etc/os-release; then - net_int=$(basename /sys/class/net/en*) - netplan_config="/etc/netplan/50-cloud-init.yaml" - gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) - cidr_range=$(ip route show | grep "kernel" | awk '{print $1}' | head -n 1) - usermod -s /bin/bash ubuntu - # Replace the MTU value in the Netplan configuration - if ! grep -qE "^[[:space:]]*mtu: 9000" $netplan_config; then - echo "MTU 9000 Packages entries not found" - # Append the MTU configuration to the Netplan file - sudo sed -i '/'"$net_int"':/a\ mtu: 9000' $netplan_config - sudo sed -i '/dhcp4: true/a \ nameservers:\n search: ['"$dns_domain"']' $netplan_config - sudo sed -i '/'"$net_int"':/a\ routes:\n - to: '"$cidr_range"'\n via: '"$gateway_ip"'\n metric: 100\n mtu: 9000' $netplan_config - sudo netplan apply - echo "MTU set to 9000 on Netplan." >> $logfile - else - echo "MTU entry already exists in Netplan. Skipping." >> $logfile - fi -fi - -echo "Initiating LSF share mount" >> $logfile -# Function to attempt NFS mount with retries -mount_nfs_with_retries() { - local server_path=$1 - local client_path=$2 - local retries=5 - local success=false - - rm -rf "${client_path}" - mkdir -p "${client_path}" - - for (( j=0; j> $logfile - if mount | grep -q "${client_path}"; then - echo "Mount successful for ${server_path} on ${client_path}" >> $logfile - success=true - break - else - echo "Attempt $((j+1)) of $retries failed for ${server_path} on ${client_path}" >> $logfile - sleep 2 - fi - done - - if [ "$success" = true ]; then - return 0 - else - return 1 - fi -} - -# Setup LSF share -if [ -n "${nfs_server_with_mount_path}" ]; then - echo "File share ${nfs_server_with_mount_path} found" >> $logfile - nfs_client_mount_path="/mnt/lsf" - if mount_nfs_with_retries "${nfs_server_with_mount_path}" "${nfs_client_mount_path}"; then - mkdir -p "$nfs_client_mount_path/openldap" - else - echo "Mount not found for ${nfs_server_with_mount_path}, Exiting !!" >> $logfile - exit 1 - fi -else - echo "No NFS server mount path provided, Exiting !!" >> $logfile - exit 1 -fi -echo "Setting LSF share is completed." 
>> $logfile - -#Installing LDAP -export DEBIAN_FRONTEND='non-interactive' -echo -e "slapd slapd/root_password password ${LDAP_ADMIN_PASSWORD}" |debconf-set-selections -echo -e "slapd slapd/root_password_again password ${LDAP_ADMIN_PASSWORD}" |debconf-set-selections -apt-get install -y slapd ldap-utils - -echo -e "slapd slapd/internal/adminpw password ${LDAP_ADMIN_PASSWORD}" |debconf-set-selections -echo -e "slapd slapd/internal/generated_adminpw password ${LDAP_ADMIN_PASSWORD}" |debconf-set-selections -echo -e "slapd slapd/password2 password ${LDAP_ADMIN_PASSWORD}" |debconf-set-selections -echo -e "slapd slapd/password1 password ${LDAP_ADMIN_PASSWORD}" |debconf-set-selections -echo -e "slapd slapd/domain string ${BASE_DN}" |debconf-set-selections -echo -e "slapd shared/organization string ${BASE_DN}" |debconf-set-selections -echo -e "slapd slapd/purge_database boolean false" |debconf-set-selections -echo -e "slapd slapd/move_old_database boolean true" |debconf-set-selections -echo -e "slapd slapd/no_configuration boolean false" |debconf-set-selections -dpkg-reconfigure slapd -echo "BASE dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" >> /etc/ldap/ldap.conf -echo "URI ldap://localhost" >> /etc/ldap/ldap.conf -systemctl restart slapd -systemctl enable slapd -echo "LDAP server installtion completed" >> $logfile - -# Generate SSL cert and Configure with OpenLDAP server -certtool --generate-privkey --sec-param High --outfile /etc/ssl/private/ldap_cakey.pem - -# Create CA template file -cat < /etc/ssl/ca.info -cn = ${LDAP_GROUP} -ca -cert_signing_key -expiration_days = 3650 -EOF - -# Generate a self-signed CA certificate -certtool --generate-self-signed \ ---load-privkey /etc/ssl/private/ldap_cakey.pem \ ---template /etc/ssl/ca.info \ ---outfile /usr/local/share/ca-certificates/ldap_cacert.pem - -# Update CA certificates and copy the generated CA certificate to /etc/ssl/certs/ -update-ca-certificates -cp -r /usr/local/share/ca-certificates/ldap_cacert.pem /etc/ssl/certs/ -cp -r /usr/local/share/ca-certificates/ldap_cacert.pem "$nfs_client_mount_path/openldap" -chmod -R 777 "$nfs_client_mount_path/openldap" - -# Generate a private key for the LDAP server -certtool --generate-privkey --sec-param High --outfile /etc/ssl/private/ldapserver_slapd_key.pem - -# Create LDAP server certificate template -cat < /etc/ssl/ldapserver.info -organization = ${LDAP_GROUP} -cn = localhost -tls_www_server -encryption_key -signing_key -expiration_days = 3650 -EOF - -# Generate a certificate for the LDAP server signed by the CA -certtool --generate-certificate \ ---load-privkey /etc/ssl/private/ldapserver_slapd_key.pem \ ---load-ca-certificate /etc/ssl/certs/ldap_cacert.pem \ ---load-ca-privkey /etc/ssl/private/ldap_cakey.pem \ ---template /etc/ssl/ldapserver.info \ ---outfile /etc/ssl/certs/ldapserver_slapd_cert.pem - -# Set proper permissions for the LDAP server private key -chgrp openldap /etc/ssl/private/ldapserver_slapd_key.pem -chmod 0640 /etc/ssl/private/ldapserver_slapd_key.pem -gpasswd -a openldap ssl-cert - -sleep 2 - -# Restart slapd service to apply changes -systemctl restart slapd.service - -# Create LDIF file for configuring TLS in LDAP -cat < /etc/ssl/certinfo.ldif -dn: cn=config -add: olcTLSCACertificateFile -olcTLSCACertificateFile: /etc/ssl/certs/ldap_cacert.pem -- -add: olcTLSCertificateFile -olcTLSCertificateFile: /etc/ssl/certs/ldapserver_slapd_cert.pem -- -add: olcTLSCertificateKeyFile -olcTLSCertificateKeyFile: /etc/ssl/private/ldapserver_slapd_key.pem -EOF - -# Apply TLS configuration using ldapmodify 
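# In short: the LDIF above adds the three olcTLS* attributes to cn=config, and applying it over the
# local ldapi:/// socket with SASL EXTERNAL lets root change the config DIT without a bind password.
# The result can be verified afterwards with, for example:
#   ldapsearch -Y EXTERNAL -H ldapi:/// -LLL -b cn=config olcTLSCACertificateFile olcTLSCertificateFile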
-ldapmodify -Y EXTERNAL -H ldapi:/// -f /etc/ssl/certinfo.ldif - -# Update slapd service to listen on ldaps:// as well -sed -i 's\SLAPD_SERVICES="ldap:/// ldapi:///"\SLAPD_SERVICES="ldap:/// ldapi:/// ldaps:///"\g' /etc/default/slapd - -sleep 2 - -# Update /etc/ldap/ldap.conf -cat < /etc/ldap/ldap.conf -BASE dc=${BASE_DN%%.*},dc=${BASE_DN#*.} -URI ldap://localhost -TLS_CACERT /etc/ssl/certs/ldap_cacert.pem -TLS_REQCERT allow -EOF - -# Restart slapd service to apply changes -systemctl restart slapd.service -echo "SSL creation complted" >> $logfile - -#LDAP Operations -check_and_create_ldap_ou() { - local ou_name="$1" - local ldif_file="${LDAP_DIR}/ou${ou_name}.ldif" - local search_result="" - - echo "dn: ou=${ou_name},dc=${BASE_DN%%.*},dc=${BASE_DN#*.} -objectClass: organizationalUnit -ou: ${ou_name}" > "${ldif_file}" - - ldapsearch -x -D "cn=admin,dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" -w "${LDAP_ADMIN_PASSWORD}" -b "ou=${ou_name},dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" "objectClass=organizationalUnit" > /dev/null 2>&1 - search_result=$? - - [ ${search_result} -eq 32 ] && echo "${ou_name}OUNotFound" || echo "${ou_name}OUFound" -} - -# LDAP | Server People OU Check and Create -ldap_people_ou_search=$(check_and_create_ldap_ou People) -[ "${ldap_people_ou_search}" == "PeopleOUNotFound" ] && ldapadd -x -D "cn=admin,dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" -w "${LDAP_ADMIN_PASSWORD}" -f "${LDAP_DIR}/ouPeople.ldif" -[ "${ldap_people_ou_search}" == "PeopleOUFound" ] && echo "LDAP OU 'People' already exists. Skipping." - -# LDAP | Server Groups OU Check and Create -ldap_groups_ou_search=$(check_and_create_ldap_ou Groups) -[ "${ldap_groups_ou_search}" == "GroupsOUNotFound" ] && ldapadd -x -D "cn=admin,dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" -w "${LDAP_ADMIN_PASSWORD}" -f "${LDAP_DIR}/ouGroups.ldif" -[ "${ldap_groups_ou_search}" == "GroupsOUFound" ] && echo "LDAP OU 'Groups' already exists. Skipping." - -# Creating LDAP Group on the LDAP Server - -# LDAP | Group File -echo "dn: cn=${LDAP_GROUP},ou=Groups,dc=${BASE_DN%%.*},dc=${BASE_DN#*.} -objectClass: posixGroup -cn: ${LDAP_GROUP} -gidNumber: 5000" > "${LDAP_DIR}/group.ldif" - -# LDAP Group Search -ldap_group_dn="cn=${LDAP_GROUP},ou=Groups,dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" -ldap_group_search_result=$(ldapsearch -x -D "cn=admin,dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" -w "${LDAP_ADMIN_PASSWORD}" -b "${ldap_group_dn}" "(cn=${LDAP_GROUP})" 2>&1) - -# Check if LDAP Group exists -if echo "${ldap_group_search_result}" | grep -q "dn: ${ldap_group_dn}," -then - echo "LDAP Group '${LDAP_GROUP}' already exists. Skipping." >> $logfile - ldap_group_search="GroupFound" -else - echo "LDAP Group '${LDAP_GROUP}' not found. Creating..." 
>> $logfile - ldapadd -x -D "cn=admin,dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" -w "${LDAP_ADMIN_PASSWORD}" -f "${LDAP_DIR}/group.ldif" - ldap_group_search="GroupNotFound" -fi - -# Creating LDAP User on the LDAP Server - -# Generate LDAP Password Hash -ldap_hashed_password=$(slappasswd -s "${LDAP_USER_PASSWORD}") - -# LDAP | User File -echo "dn: uid=${LDAP_USER},ou=People,dc=${BASE_DN%%.*},dc=${BASE_DN#*.} -objectClass: inetOrgPerson -objectClass: posixAccount -objectClass: shadowAccount -uid: ${LDAP_USER} -sn: ${LDAP_USER} -givenName: ${LDAP_USER} -cn: ${LDAP_USER} -displayName: ${LDAP_USER} -uidNumber: 10000 -gidNumber: 5000 -userPassword: ${ldap_hashed_password} -gecos: ${LDAP_USER} -loginShell: /bin/bash -homeDirectory: /home/${LDAP_USER}" > "${LDAP_DIR}/users.ldif" - -# LDAP User Search -ldap_user_dn="uid=${LDAP_USER},ou=People,dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" -ldap_user_search_result=$(ldapsearch -x -D "cn=admin,dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" -w "${LDAP_ADMIN_PASSWORD}" -b "${ldap_user_dn}" uid cn 2>&1) - -# Check if LDAP User exists -if echo "${ldap_user_search_result}" | grep -q "dn: ${ldap_user_dn}," -then - echo "LDAP User '${LDAP_USER}' already exists. Skipping." >> $logfile - ldap_user_search="UserFound" -else - echo "LDAP User '${LDAP_USER}' not found. Creating..." >> $logfile - ldapadd -x -D "cn=admin,dc=${BASE_DN%%.*},dc=${BASE_DN#*.}" -w "${LDAP_ADMIN_PASSWORD}" -f "${LDAP_DIR}/users.ldif" - ldap_user_search="UserNotFound" -fi -echo "User and Group creation complted" >> $logfile - -# Attempt to unmount the VPC share -if umount -l "${nfs_client_mount_path}"; -then - echo "Unmounted ${nfs_client_mount_path} successfully." >> $logfile -else - echo "Failed to unmount ${nfs_client_mount_path}." >> $logfile - exit 1 -fi diff --git a/modules/landing_zone_vsi/templates/ldap_user_data.tpl b/modules/landing_zone_vsi/templates/ldap_user_data.tpl index 95333fa1..1ffc145f 100644 --- a/modules/landing_zone_vsi/templates/ldap_user_data.tpl +++ b/modules/landing_zone_vsi/templates/ldap_user_data.tpl @@ -5,18 +5,20 @@ # Licensed under the Apache License v2.0 ################################################### -logfile=/tmp/user_data.log -echo "Export LDAP user data (variable values)" -echo "START $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile +#!/usr/bin/env bash +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys -%EXPORT_USER_DATA% #input parameters -ldap_basedns="${ldap_basedns}" -ldap_admin_password="${ldap_admin_password}" -cluster_prefix="${cluster_prefix}" -ldap_user="${ldap_user}" -ldap_user_password="${ldap_user_password}" -dns_domain="${dns_domain}" -mount_path="${mount_path}" - -echo "END $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile +# input parameters +echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys +echo "${compute_public_key_content}" >> ~/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> ~/.ssh/config +echo "${compute_private_key_content}" > ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa diff --git a/modules/landing_zone_vsi/templates/login_user_data.tpl b/modules/landing_zone_vsi/templates/login_user_data.tpl index b928bb0b..b744f037 100644 --- a/modules/landing_zone_vsi/templates/login_user_data.tpl +++ b/modules/landing_zone_vsi/templates/login_user_data.tpl @@ 
-1,28 +1,62 @@ #!/usr/bin/bash + ################################################### # Copyright (C) IBM Corp. 2023 All Rights Reserved. # Licensed under the Apache License v2.0 ################################################### -logfile=/tmp/user_data.log -echo "Export user data (variable values)" -echo "START $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile - -%EXPORT_USER_DATA% -#input parameters -network_interface=${network_interface} -dns_domain="${dns_domain}" -cluster_private_key_content="${cluster_private_key_content}" -cluster_public_key_content="${cluster_public_key_content}" -mount_path="${mount_path}" -custom_mount_paths="${custom_mount_paths}" -custom_file_shares="${custom_file_shares}" -enable_ldap="${enable_ldap}" -network_interface=""${network_interface}"" -rc_cidr_block="${rc_cidr_block}" -rc_cidr_block_1="${rc_cidr_block_1}" -cluster_prefix="${cluster_prefix}" -ldap_server_ip="${ldap_server_ip}" -ldap_basedns="${ldap_basedns}" -hyperthreading="${hyperthreading}" -echo "END $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile +#!/usr/bin/env bash +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi + +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please client as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys + +# input parameters +echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys +echo "${login_public_key_content}" >> ~/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> ~/.ssh/config +echo "${login_private_key_content}" > ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa + +# Network Configuration +RESOLV_CONF="/etc/resolv.conf" +BACKUP_FILE="/etc/resolv.conf.bkp" + +# Optional: backup the interface config +echo "DOMAIN=${login_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${login_interfaces}" +echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${login_interfaces}" +chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +systemctl restart NetworkManager + +make_editable() { + if lsattr "$RESOLV_CONF" 2>/dev/null | grep -q 'i'; then + chattr -i "$RESOLV_CONF" + fi +} + +make_immutable() { + chattr +i "$RESOLV_CONF" +} + +# Backup if not already +if [ ! -f "$BACKUP_FILE" ]; then + cp "$RESOLV_CONF" "$BACKUP_FILE" + echo "Backup created at $BACKUP_FILE" +fi + +make_editable + +# Modify or insert 'search' domain +if grep -q '^search ' "$RESOLV_CONF"; then + sed -i "s/^search .*/search ${login_dns_domain}/" "$RESOLV_CONF" +else + echo "search ${login_dns_domain}" >> "$RESOLV_CONF" +fi + +make_immutable +echo "Updated $RESOLV_CONF with search domain '${login_dns_domain}' and locked file." diff --git a/modules/landing_zone_vsi/templates/login_vsi.sh b/modules/landing_zone_vsi/templates/login_vsi.sh deleted file mode 100644 index 07580eef..00000000 --- a/modules/landing_zone_vsi/templates/login_vsi.sh +++ /dev/null @@ -1,403 +0,0 @@ -#!/bin/sh -# shellcheck disable=all - -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
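The new login template above pins the DNS search domain by flipping the immutable bit on /etc/resolv.conf around the edit, so later NetworkManager or DHCP updates cannot overwrite it. The pattern, reduced to its essentials (the domain value is an illustrative stand-in for ${login_dns_domain}):

RESOLV_CONF="/etc/resolv.conf"
domain="clusterlogin.example.com"

lsattr "$RESOLV_CONF" 2>/dev/null | grep -q 'i' && chattr -i "$RESOLV_CONF"   # unlock for editing
if grep -q '^search ' "$RESOLV_CONF"; then
  sed -i "s/^search .*/search ${domain}/" "$RESOLV_CONF"
else
  echo "search ${domain}" >> "$RESOLV_CONF"
fi
chattr +i "$RESOLV_CONF"                                                      # lock it again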
-# Licensed under the Apache License v2.0 -################################################### - -#variables - -logfile="/tmp/user_data.log" - -LSF_TOP="/opt/ibm/lsf" -LSF_CONF=$LSF_TOP/conf -LSF_HOSTS_FILE="/etc/hosts" - -nfs_server_with_mount_path=${mount_path} -custom_mount_paths="${custom_mount_paths}" -custom_file_shares="${custom_file_shares}" - -# Setup logs for user data -echo "START $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile - -# Disallow root login -sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"lsfadmin or vpcuser\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys - -# echo "DOMAIN=\"$dns_domain\"" >> "/etc/sysconfig/network-scripts/ifcfg-eth0" -echo "DOMAIN=\"$dns_domain\"" >> "/etc/sysconfig/network-scripts/ifcfg-${network_interface}" - - -# Setup lsfadmin user -# Updates the lsfadmin user as never expire -chage -I -1 -m 0 -M 99999 -E -1 -W 14 lsfadmin -# Setup ssh -lsfadmin_home_dir="/home/lsfadmin" -lsfadmin_ssh_dir="${lsfadmin_home_dir}/.ssh" -mkdir -p ${lsfadmin_ssh_dir} - -# Change for RHEL / Ubuntu compute image. -if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release; then - cp /home/vpcuser/.ssh/authorized_keys "${lsfadmin_ssh_dir}/authorized_keys" -elif grep -q "NAME=\"Ubuntu\"" /etc/os-release; then - cp /home/ubuntu/.ssh/authorized_keys "${lsfadmin_ssh_dir}/authorized_keys" - sudo cp /home/ubuntu/.profile "{$lsfadmin_home_dir}" -else - echo "Provided OS distribution not match, provide either RHEL or Ubuntu" >> $logfile -fi - -echo "${cluster_public_key_content}" >> "${lsfadmin_ssh_dir}/authorized_keys" -echo "${cluster_private_key_content}" >> "${lsfadmin_ssh_dir}/id_rsa" -echo "StrictHostKeyChecking no" >> "${lsfadmin_ssh_dir}/config" -chmod 600 "${lsfadmin_ssh_dir}/authorized_keys" -chmod 600 "${lsfadmin_ssh_dir}/id_rsa" -chmod 700 ${lsfadmin_ssh_dir} -chown -R lsfadmin:lsfadmin ${lsfadmin_ssh_dir} -echo "SSH key setup for lsfadmin user is completed" >> $logfile - -# Setup Network configuration -# Change the MTU setting as this is required for setting mtu as 9000 for communication to happen between clusters -if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release; then - # Replace the MTU value in the Netplan configuration - echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${network_interface}" - echo "DOMAIN=\"${dns_domain}\"" >> "/etc/sysconfig/network-scripts/ifcfg-${network_interface}" - # Change the MTU setting as 9000 at router level. - gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) - echo "${rc_cidr_block} via $gateway_ip dev ${network_interface} metric 0 mtu 9000" >> /etc/sysconfig/network-scripts/route-eth0 - systemctl restart NetworkManager -elif grep -q "NAME=\"Ubuntu\"" /etc/os-release; then - net_int=$(basename /sys/class/net/en*) - netplan_config="/etc/netplan/50-cloud-init.yaml" - gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) - cidr_range=$(ip route show | grep "kernel" | awk '{print $1}' | head -n 1) - usermod -s /bin/bash lsfadmin - # Replace the MTU value in the Netplan configuration - if ! 
grep -qE "^[[:space:]]*mtu: 9000" $netplan_config; then - echo "MTU 9000 Packages entries not found" - # Append the MTU configuration to the Netplan file - sudo sed -i '/'"$net_int"':/a\ mtu: 9000' $netplan_config - sudo sed -i '/dhcp4: true/a \ nameservers:\n search: ['"$dns_domain"']' $netplan_config - sudo sed -i '/'"$net_int"':/a\ routes:\n - to: '"$cidr_range"'\n via: '"$gateway_ip"'\n metric: 100\n mtu: 9000' $netplan_config - sudo netplan apply - echo "MTU set to 9000 on Netplan." - else - echo "MTU entry already exists in Netplan. Skipping." - fi -fi - -# Setup root user -root_ssh_dir="/root/.ssh" -echo "${cluster_public_key_content}" >> $root_ssh_dir/authorized_keys -echo "StrictHostKeyChecking no" >> $root_ssh_dir/config -echo "cluster ssh key has been added to root user" >> $logfile - -echo "$hyperthreading" -if [ "$hyperthreading" == true ]; then - ego_define_ncpus="threads" -else - ego_define_ncpus="cores" - cat << 'EOT' > /root/lsf_hyperthreading -#!/bin/sh -for vcpu in $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | cut -s -d- -f2 | cut -d- -f2 | uniq); do - echo "0" > "/sys/devices/system/cpu/cpu"$vcpu"/online" -done -EOT - chmod 755 /root/lsf_hyperthreading - command="/root/lsf_hyperthreading" - sh $command && (crontab -l 2>/dev/null; echo "@reboot $command") | crontab - -fi - -echo "Initiating LSF share mount" >> $logfile -# Function to attempt NFS mount with retries -mount_nfs_with_retries() { - local server_path=$1 - local client_path=$2 - local retries=5 - local success=false - - rm -rf "${client_path}" - mkdir -p "${client_path}" - - for (( j=0; j> $logfile - if mount | grep -q "${client_path}"; then - echo "Mount successful for ${server_path} on ${client_path}" >> $logfile - success=true - break - else - echo "Attempt $((j+1)) of $retries failed for ${server_path} on ${client_path}" >> $logfile - sleep 2 - fi - done - - if [ "$success" = true ]; then - echo "${server_path} ${client_path} nfs rw,sec=sys,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,_netdev 0 0" >> /etc/fstab - else - echo "Mount not found for ${server_path} on ${client_path} after $retries attempts." >> $logfile - rm -rf "${client_path}" - fi - - if [ "$success" = true ]; then - return 0 - else - return 1 - fi -} - -# Setup LSF share -if [ -n "${nfs_server_with_mount_path}" ]; then - echo "File share ${nfs_server_with_mount_path} found" >> $logfile - nfs_client_mount_path="/mnt/lsf" - rm -rf /opt/ibm/lsf/conf/ - rm -rf /opt/ibm/lsf/work/ - if mount_nfs_with_retries "${nfs_server_with_mount_path}" "${nfs_client_mount_path}"; then - # Move stuff to shared fs - for dir in conf work; do - mv "${LSF_TOP}/$dir" "${nfs_client_mount_path}" - ln -fs "${nfs_client_mount_path}/$dir" "${LSF_TOP}" - done - chown -R lsfadmin:root "${LSF_TOP}" - else - echo "Mount not found for ${nfs_server_with_mount_path}, Exiting !!" >> $logfile - exit 1 - fi -else - echo "No NFS server mount path provided, Exiting !!" >> $logfile - exit 1 -fi -echo "Setting LSF share is completed." >> $logfile - -# Setup Custom file shares -echo "Setting custom file shares." 
>> $logfile -if [ -n "${custom_file_shares}" ]; then - echo "Custom file share ${custom_file_shares} found" >> $logfile - file_share_array=(${custom_file_shares}) - mount_path_array=(${custom_mount_paths}) - length=${#file_share_array[@]} - - for (( i=0; i> $logfile - -echo "source ${LSF_CONF}/profile.lsf" >> "${lsfadmin_home_dir}"/.bashrc -echo "source ${LSF_CONF}/profile.lsf" >> /root/.bashrc -echo "profile setup copy complete" >> $logfile - -# Pause execution for 30 seconds -sleep 30 - -# Display the contents of /etc/resolv.conf before changes -echo "Contents of /etc/resolv.conf before changes:" -cat /etc/resolv.conf - -# Display the updated contents of /etc/resolv.conf -echo "Contents of /etc/resolv.conf after changes:" >> $logfile -cat /etc/resolv.conf -#python3 -c "import ipaddress; print('\n'.join([str(ip) + ' ${cluster_prefix}-' + str(ip).replace('.', '-') for ip in ipaddress.IPv4Network('${rc_cidr_block_1}')]) + '\n' + '\n'.join([str(ip) + ' ${cluster_prefix}-' + str(ip).replace('.', '-') for ip in ipaddress.IPv4Network('${rc_cidr_block_2}')]))" >> "$LSF_HOSTS_FILE" - -#Hostname resolution - login node to management nodes -echo "Pausing for 300 seconds to configure hostname name resolution..." >> $logfile -sleep 300 -ls /mnt/lsf -ls -ltr /mnt/lsf -cp /mnt/lsf/conf/hosts /etc/hosts - -# Ldap Configuration: -enable_ldap="${enable_ldap}" -ldap_server_ip="${ldap_server_ip}" -base_dn="${ldap_basedns}" - -# Setting up the LDAP configuration -if [ "$enable_ldap" = "true" ]; then - - # Detect if the operating system is RHEL or Rocky Linux - if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release || grep -q "NAME=\"Rocky Linux\"" /etc/os-release; then - - # Extract and store the major version of the operating system (8 or 9) - version=$(grep -oE 'release [0-9]+' /etc/redhat-release | awk '{print $2}') - - # Proceed if the detected version is either 8 or 9 - if [ "$version" == "8" ] || [ "$version" == "9" ]; then - echo "Detected as RHEL or Rocky $version. Proceeding with LDAP client configuration..." >> $logfile - - # Enable password authentication for SSH by modifying the configuration file - sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config - systemctl restart sshd - - # Check if the SSL certificate file exists, then copy it to the correct location - # Retry finding SSL certificate with a maximum of 100 attempts and 5 seconds sleep between retries - for attempt in {1..100}; do - if [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ]; then - echo "LDAP SSL cert found under /mnt/lsf/openldap/ldap_cacert.pem path" >> $logfile - mkdir -p /etc/openldap/certs - cp -pr /mnt/lsf/openldap/ldap_cacert.pem /etc/openldap/certs/ldap_cacert.pem - break - else - echo "SSL cert not found on attempt $attempt. Retrying in 5 seconds..." >> $logfile - sleep 5 - fi - done - # Exit if the SSL certificate is still not found after 100 attempts - [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ] || { echo "SSL cert not found after 100 attempts. Exiting." 
>> $logfile; exit 1; }
-
-      # Create and configure the SSSD configuration file for LDAP integration
-      cat <<EOF > /etc/sssd/sssd.conf
-[sssd]
-config_file_version = 2
-services = nss, pam, autofs
-domains = default
-
-[nss]
-homedir_substring = /home
-
-[pam]
-
-[domain/default]
-id_provider = ldap
-autofs_provider = ldap
-auth_provider = ldap
-chpass_provider = ldap
-ldap_uri = ldap://${ldap_server_ip}
-ldap_search_base = dc=${base_dn%%.*},dc=${base_dn#*.}
-ldap_id_use_start_tls = True
-ldap_tls_cacertdir = /etc/openldap/certs
-cache_credentials = True
-ldap_tls_reqcert = allow
-EOF
-
-      # Secure the SSSD configuration file by setting appropriate permissions
-      chmod 600 /etc/sssd/sssd.conf
-      chown root:root /etc/sssd/sssd.conf
-
-      # Create and configure the OpenLDAP configuration file for TLS
-      cat <<EOF > /etc/openldap/ldap.conf
-BASE dc=${base_dn%%.*},dc=${base_dn#*.}
-URI ldap://${ldap_server_ip}
-TLS_CACERT /etc/openldap/certs/ldap_cacert.pem
-TLS_CACERTDIR /etc/openldap/certs
-EOF
-
-      # Rehash certificates in the OpenLDAP directory to ensure proper recognition
-      openssl rehash /etc/openldap/certs
-
-      # Apply the SSSD and home directory creation configuration using authselect
-      authselect select sssd with-mkhomedir --force
-
-      # Enable and start the SSSD and oddjobd services for user authentication and home directory management
-      systemctl enable --now sssd oddjobd
-
-      # Restart both services to apply the configuration
-      echo "Restarting OpenLDAP SSSD service." >> $logfile
-      systemctl restart sssd oddjobd
-
-
-      # Validate the LDAP configuration by performing a test search using ldapsearch
-      if ldapsearch -x -H ldap://"${ldap_server_ip}"/ -b "dc=${base_dn%%.*},dc=${base_dn#*.}" > /dev/null; then
-        echo "LDAP configuration completed successfully!" >> $logfile
-      else
-        echo "LDAP configuration failed! Exiting." >> $logfile
-        exit 1
-      fi
-
-      # Ensure LSF commands are available to all users by adding the profile to bashrc
-      echo ". ${LSF_CONF}/profile.lsf" >> /etc/bashrc
-      source /etc/bashrc
-
-    else
-      echo "This script is intended for RHEL and Rocky Linux 8 or 9. Detected version: $version. Exiting." >> $logfile
-      exit 1
-    fi
-
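The RHEL/Rocky branch of the removed login_vsi.sh above validates the directory with a plain `ldapsearch -x`, even though SSSD is configured for StartTLS against the copied CA certificate. A minimal sketch of exercising that same TLS path, assuming the standard openldap-clients tooling and the `ldap_server_ip` and `base_dn` variables defined earlier in this script:

# Illustrative check only: force StartTLS (-ZZ) and trust the CA cert the script just installed.
LDAPTLS_CACERT=/etc/openldap/certs/ldap_cacert.pem \
  ldapsearch -x -ZZ -H "ldap://${ldap_server_ip}" \
  -b "dc=${base_dn%%.*},dc=${base_dn#*.}" -s base > /dev/null \
  && echo "StartTLS bind and search succeeded" \
  || echo "StartTLS validation failed"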
-  # Detect if the operating system is Ubuntu
-  elif grep -q "NAME=\"Ubuntu\"" /etc/os-release; then
-
-    echo "Detected as Ubuntu. Proceeding with LDAP client configuration..." >> $logfile
-
-    # Allow password authentication for SSH in two configuration files, then restart the SSH service
-    sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config
-    sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config.d/50-cloudimg-settings.conf
-    sudo systemctl restart ssh
-
-    # Add configuration for automatic home directory creation to the PAM session configuration file
-    sudo sed -i '$ i\session required pam_mkhomedir.so skel=/etc/skel umask=0022\' /etc/pam.d/common-session
-
-    # Check if the SSL certificate file exists, then copy it to the correct location
-    # Retry finding SSL certificate with a maximum of 100 attempts and 5 seconds sleep between retries
-    for attempt in {1..100}; do
-      if [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ]; then
-        echo "LDAP SSL cert found under /mnt/lsf/openldap/ldap_cacert.pem path" >> $logfile
-        mkdir -p /etc/ldap/certs
-        cp -pr /mnt/lsf/openldap/ldap_cacert.pem /etc/ldap/certs/ldap_cacert.pem
-        break
-      else
-        echo "SSL cert not found on attempt $attempt. Retrying in 5 seconds..." >> $logfile
-        sleep 5
-      fi
-    done
-    # Exit if the SSL certificate is still not found after 100 attempts
-    [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ] || { echo "SSL cert not found after 100 attempts. Exiting." >> $logfile; exit 1; }
-
-    # Create and configure the SSSD configuration file for LDAP integration on Ubuntu
-    cat <<EOF > /etc/sssd/sssd.conf
-[sssd]
-config_file_version = 2
-services = nss, pam, autofs
-domains = default
-
-[nss]
-homedir_substring = /home
-
-[pam]
-
-[domain/default]
-id_provider = ldap
-autofs_provider = ldap
-auth_provider = ldap
-chpass_provider = ldap
-ldap_uri = ldap://${ldap_server_ip}
-ldap_search_base = dc=${base_dn%%.*},dc=${base_dn#*.}
-ldap_id_use_start_tls = True
-ldap_tls_cacertdir = /etc/ldap/certs
-cache_credentials = True
-ldap_tls_reqcert = allow
-EOF
-
-    # Secure the SSSD configuration file by setting appropriate permissions
-    sudo chmod 600 /etc/sssd/sssd.conf
-    sudo chown root:root /etc/sssd/sssd.conf
-
-    # Create and configure the OpenLDAP configuration file for TLS on Ubuntu
-    cat <<EOF > /etc/ldap/ldap.conf
-BASE dc=${base_dn%%.*},dc=${base_dn#*.}
-URI ldap://${ldap_server_ip}
-TLS_CACERT /etc/ldap/certs/ldap_cacert.pem
-TLS_CACERTDIR /etc/ldap/certs
-EOF
-
-    # Rehash certificates in the OpenLDAP directory to ensure proper recognition
-    openssl rehash /etc/ldap/certs
-
-    # Enable and start the SSSD and oddjobd services for user authentication and home directory management
-    echo "Restarting OpenLDAP SSSD service." >> $logfile
-    sudo systemctl enable --now sssd oddjobd && sudo systemctl restart sssd oddjobd
-
-    # Ensure LSF commands are available to all users by adding the profile to bash.bashrc
-    echo ". ${LSF_CONF}/profile.lsf" >> /etc/bash.bashrc
-    source /etc/bash.bashrc
-
-    # Validate the LDAP configuration by checking the status of the SSSD service
-    if sudo systemctl is-active --quiet sssd; then
-      echo "LDAP client configuration completed successfully!" >> $logfile
-    else
-      echo "LDAP client configuration failed! Exiting." >> $logfile
-      exit 1
-    fi
-
-  else
-    echo "This script is designed for RHEL, Rocky Linux, or Ubuntu. Unsupported OS detected. Exiting." >> $logfile
-    exit 1
-  fi
-fi
diff --git a/modules/landing_zone_vsi/templates/lsf_compute_user_data.tpl b/modules/landing_zone_vsi/templates/lsf_compute_user_data.tpl
new file mode 100644
index 00000000..b8f280a2
--- /dev/null
+++ b/modules/landing_zone_vsi/templates/lsf_compute_user_data.tpl
@@ -0,0 +1,62 @@
+#!/usr/bin/bash
+
+###################################################
+# Copyright (C) IBM Corp. 2023 All Rights Reserved.
+# Licensed under the Apache License v2.0 +################################################### + +#!/usr/bin/env bash +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi + +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please client as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys + +# input parameters +echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys +echo "${management_public_key_content}" >> ~/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> ~/.ssh/config +echo "${management_private_key_content}" > ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa + +# Network Configuration +RESOLV_CONF="/etc/resolv.conf" +BACKUP_FILE="/etc/resolv.conf.bkp" + +# Optional: backup the interface config +echo "DOMAIN=${management_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${management_interfaces}" +echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${management_interfaces}" +chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +systemctl restart NetworkManager + +make_editable() { + if lsattr "$RESOLV_CONF" 2>/dev/null | grep -q 'i'; then + chattr -i "$RESOLV_CONF" + fi +} + +make_immutable() { + chattr +i "$RESOLV_CONF" +} + +# Backup if not already +if [ ! -f "$BACKUP_FILE" ]; then + cp "$RESOLV_CONF" "$BACKUP_FILE" + echo "Backup created at $BACKUP_FILE" +fi + +make_editable + +# Modify or insert 'search' domain +if grep -q '^search ' "$RESOLV_CONF"; then + sed -i "s/^search .*/search ${management_dns_domain}/" "$RESOLV_CONF" +else + echo "search ${management_dns_domain}" >> "$RESOLV_CONF" +fi + +make_immutable +echo "Updated $RESOLV_CONF with search domain '${management_dns_domain}' and locked file." diff --git a/modules/landing_zone_vsi/templates/lsf_management.sh b/modules/landing_zone_vsi/templates/lsf_management.sh deleted file mode 100644 index 40b072b7..00000000 --- a/modules/landing_zone_vsi/templates/lsf_management.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# shellcheck disable=all - -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. -# Licensed under the Apache License v2.0 -################################################### - -# Local variable declaration -logfile="/tmp/user_data.log" - -# Setup logs for user data -echo "START $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile - -echo "umask=$(umask)" >> $logfile - -# Disallow root login -sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"lsfadmin or vpcuser\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys - -# Setup Network configuration -# Change the MTU setting as this is required for setting mtu as 9000 for communication to happen between clusters -echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${network_interface}" -echo "DOMAIN=\"$dns_domain\"" >> "/etc/sysconfig/network-scripts/ifcfg-${network_interface}" - -# Change the MTU setting as 9000 at router level. -gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) -echo "${rc_cidr_block} via $gateway_ip dev ${network_interface} metric 0 mtu 9000" >> /etc/sysconfig/network-scripts/route-"${network_interface}" - -systemctl restart NetworkManager - -echo 1 > /proc/sys/vm/overcommit_memory # tt reports many failures of memory allocation at fork(). why? 
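For context, the kernel tunables that this deleted management-node script writes wholesale into /etc/sysctl.conf just below could equally be expressed as a drop-in file. The following is only an illustrative sketch, with an arbitrary drop-in name, not something this PR introduces:

# Illustrative only: same values as the block below, applied via /etc/sysctl.d instead of overwriting /etc/sysctl.conf.
cat <<'EOF' > /etc/sysctl.d/90-lsf-tuning.conf
vm.overcommit_memory=1
net.core.rmem_max=26214400
net.core.rmem_default=26214400
net.core.wmem_max=26214400
net.core.wmem_default=26214400
net.ipv4.tcp_fin_timeout=5
net.core.somaxconn=8000
EOF
sysctl --system   # loads /etc/sysctl.d/*.conf alongside /etc/sysctl.conf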
-{ - echo 'vm.overcommit_memory=1' - echo 'net.core.rmem_max=26214400' - echo 'net.core.rmem_default=26214400' - echo 'net.core.wmem_max=26214400' - echo 'net.core.wmem_default=26214400' - echo 'net.ipv4.tcp_fin_timeout = 5' - echo 'net.core.somaxconn = 8000' -} > /etc/sysctl.conf -sysctl -p /etc/sysctl.conf - -if [ ! "$hyperthreading" == true ]; then - for vcpu in $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | cut -s -d- -f2 | cut -d- -f2 | uniq); do - echo 0 > /sys/devices/system/cpu/cpu"$vcpu"/online - done -fi - -# Setup lsfadmin user -# Updates the lsfadmin user as never expire -chage -I -1 -m 0 -M 99999 -E -1 -W 14 lsfadmin -# Setup ssh -lsfadmin_home_dir="/home/lsfadmin" -lsfadmin_ssh_dir="${lsfadmin_home_dir}/.ssh" -mkdir -p ${lsfadmin_ssh_dir} -cp /home/vpcuser/.ssh/authorized_keys "${lsfadmin_ssh_dir}/authorized_keys" -echo "${cluster_public_key_content}" >> "${lsfadmin_ssh_dir}/authorized_keys" -echo "${cluster_private_key_content}" >> "${lsfadmin_ssh_dir}/id_rsa" -echo "StrictHostKeyChecking no" >> "${lsfadmin_ssh_dir}/config" -chmod 600 "${lsfadmin_ssh_dir}/authorized_keys" -chmod 600 "${lsfadmin_ssh_dir}/id_rsa" -chmod 700 ${lsfadmin_ssh_dir} -chown -R lsfadmin:lsfadmin ${lsfadmin_ssh_dir} -echo "SSH key setup for lsfadmin user is completed" >> $logfile - -# Setup root user -root_ssh_dir="/root/.ssh" -echo "${cluster_public_key_content}" >> $root_ssh_dir/authorized_keys -echo "StrictHostKeyChecking no" >> $root_ssh_dir/config -echo "cluster ssh key has been added to root user" >> $logfile diff --git a/modules/landing_zone_vsi/templates/management_user_data.tpl b/modules/landing_zone_vsi/templates/management_user_data.tpl index 3c441833..b8f280a2 100644 --- a/modules/landing_zone_vsi/templates/management_user_data.tpl +++ b/modules/landing_zone_vsi/templates/management_user_data.tpl @@ -5,18 +5,58 @@ # Licensed under the Apache License v2.0 ################################################### -logfile=/tmp/user_data.log -echo "Export user data (variable values)" -echo "START $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile +#!/usr/bin/env bash +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi -### EXPORT_USER_DATA ### +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please client as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys -#input parameters -rc_cidr_block="${rc_cidr_block}" -cluster_private_key_content="${cluster_private_key_content}" -cluster_public_key_content="${cluster_public_key_content}" -hyperthreading="${hyperthreading}" -network_interface=${network_interface} -dns_domain="${dns_domain}" +# input parameters +echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys +echo "${management_public_key_content}" >> ~/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> ~/.ssh/config +echo "${management_private_key_content}" > ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa -echo "END $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile +# Network Configuration +RESOLV_CONF="/etc/resolv.conf" +BACKUP_FILE="/etc/resolv.conf.bkp" + +# Optional: backup the interface config +echo "DOMAIN=${management_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${management_interfaces}" +echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${management_interfaces}" +chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +systemctl restart NetworkManager + +make_editable() { + if lsattr 
"$RESOLV_CONF" 2>/dev/null | grep -q 'i'; then + chattr -i "$RESOLV_CONF" + fi +} + +make_immutable() { + chattr +i "$RESOLV_CONF" +} + +# Backup if not already +if [ ! -f "$BACKUP_FILE" ]; then + cp "$RESOLV_CONF" "$BACKUP_FILE" + echo "Backup created at $BACKUP_FILE" +fi + +make_editable + +# Modify or insert 'search' domain +if grep -q '^search ' "$RESOLV_CONF"; then + sed -i "s/^search .*/search ${management_dns_domain}/" "$RESOLV_CONF" +else + echo "search ${management_dns_domain}" >> "$RESOLV_CONF" +fi + +make_immutable +echo "Updated $RESOLV_CONF with search domain '${management_dns_domain}' and locked file." diff --git a/modules/landing_zone_vsi/templates/protocol_user_data.tpl b/modules/landing_zone_vsi/templates/protocol_user_data.tpl index 4e5b622f..4fafedc1 100644 --- a/modules/landing_zone_vsi/templates/protocol_user_data.tpl +++ b/modules/landing_zone_vsi/templates/protocol_user_data.tpl @@ -6,6 +6,8 @@ ################################################### #!/usr/bin/env bash +exec > >(tee /var/log/ibm_spectrumscale_user-data.log) + if grep -E -q "CentOS|Red Hat" /etc/os-release then USER=vpcuser @@ -13,7 +15,7 @@ elif grep -q "Ubuntu" /etc/os-release then USER=ubuntu fi -sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please client as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys # input parameters echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys @@ -22,27 +24,99 @@ echo "StrictHostKeyChecking no" >> ~/.ssh/config echo "${storage_private_key_content}" > ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa -# scale pre-requisite installation -if grep -q "CentOS|Red Hat" /etc/os-release +if grep -q "Red Hat" /etc/os-release then USER=vpcuser - if grep -q "platform:el8" /etc/os-release + REQ_PKG_INSTALLED=0 + if grep -q "platform:el9" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables-nft nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + elif grep -q "platform:el8" /etc/os-release then PACKAGE_MGR=dnf - package_list="python38 kernel-devel-$(uname -r) kernel-headers-$(uname -r)" + package_list="python38 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl jq make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" else PACKAGE_MGR=yum - package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r)" + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel yum-plugin-versionlock" fi + + RETRY_LIMIT=5 + retry_count=0 + all_pkg_installed=1 + + while [[ $all_pkg_installed -ne 0 && $retry_count -lt $RETRY_LIMIT ]] + do + # Install all required packages + echo "INFO: Attempting to install packages" + $PACKAGE_MGR install -y $package_list + + # Check to ensure packages are installed + pkg_installed=0 + for pkg in $package_list + do + pkg_query=$($PACKAGE_MGR list installed $pkg) + pkg_installed=$(($? 
+ $pkg_installed)) + done + if [[ $pkg_installed -ne 0 ]] + then + # The minimum required packages have not been installed. + echo "WARN: Required packages not installed. Sleeping for 60 seconds and retrying..." + touch /var/log/scale-rerun-package-install + echo "INFO: Cleaning and repopulating repository data" + $PACKAGE_MGR clean all + $PACKAGE_MGR makecache + sleep 60 + else + all_pkg_installed=0 + fi + retry_count=$(( $retry_count+1 )) + done + +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu fi +yum update --security -y +yum versionlock add $package_list +yum versionlock list +echo 'export PATH=$PATH:/usr/lpp/mmfs/bin' >> /root/.bashrc + # network setup echo "DOMAIN=${storage_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${storage_interfaces}" echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${storage_interfaces}" chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +sleep 120 systemctl restart NetworkManager -# protocol network setup +systemctl stop firewalld +firewall-offline-cmd --zone=public --add-port=1191/tcp +firewall-offline-cmd --zone=public --add-port=4444/tcp +firewall-offline-cmd --zone=public --add-port=4444/udp +firewall-offline-cmd --zone=public --add-port=4739/udp +firewall-offline-cmd --zone=public --add-port=4739/tcp +firewall-offline-cmd --zone=public --add-port=9084/tcp +firewall-offline-cmd --zone=public --add-port=9085/tcp +firewall-offline-cmd --zone=public --add-service=http +firewall-offline-cmd --zone=public --add-service=https +firewall-offline-cmd --zone=public --add-port=2049/tcp +firewall-offline-cmd --zone=public --add-port=2049/udp +firewall-offline-cmd --zone=public --add-port=111/tcp +firewall-offline-cmd --zone=public --add-port=111/udp +firewall-offline-cmd --zone=public --add-port=30000-61000/tcp +firewall-offline-cmd --zone=public --add-port=30000-61000/udp +systemctl start firewalld +systemctl enable firewalld + +sec_interface=$(nmcli -t con show --active | grep eth1 | cut -d ':' -f 1) +nmcli conn del "$sec_interface" +nmcli con add type ethernet con-name eth1 ifname eth1 echo "DOMAIN=${protocol_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${protocol_interfaces}" -echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg"${protocol_interfaces}" +echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${protocol_interfaces}" systemctl restart NetworkManager + +###### TODO: Fix Me ###### +echo 'export IC_REGION=${vpc_region}' >> /root/.bashrc +echo 'export IC_SUBNET=${protocol_subnets}' >> /root/.bashrc +echo 'export IC_RG=${resource_group_id}' >> /root/.bashrc diff --git a/modules/landing_zone_vsi/templates/scale_compute_user_data.tpl b/modules/landing_zone_vsi/templates/scale_compute_user_data.tpl new file mode 100644 index 00000000..605ea6f8 --- /dev/null +++ b/modules/landing_zone_vsi/templates/scale_compute_user_data.tpl @@ -0,0 +1,111 @@ +################################################### +# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
+# Licensed under the Apache License v2.0 +################################################### + +################################################################################################################## +# Scale Compute Cluter User Data +################################################################################################################## + +#!/usr/bin/env bash + +exec > >(tee /var/log/ibm_spectrumscale_user-data.log) + +if grep -E -q "CentOS|Red Hat" /etc/os-release +then + USER=vpcuser +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi + +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 10; exit 142\" /" ~/.ssh/authorized_keys +echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys +echo "${compute_public_key_content}" >> ~/.ssh/authorized_keys +echo "StrictHostKeyChecking no" >> ~/.ssh/config +echo "${compute_private_key_content}" > ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa + +if grep -q "Red Hat" /etc/os-release +then + USER=vpcuser + REQ_PKG_INSTALLED=0 + if grep -q "platform:el9" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables-nft nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + elif grep -q "platform:el8" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python38 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl jq make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + else + PACKAGE_MGR=yum + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) rsync firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel yum-plugin-versionlock" + fi + + RETRY_LIMIT=5 + retry_count=0 + all_pkg_installed=1 + + while [[ $all_pkg_installed -ne 0 && $retry_count -lt $RETRY_LIMIT ]] + do + # Install all required packages + echo "INFO: Attempting to install packages" + $PACKAGE_MGR install -y $package_list + + # Check to ensure packages are installed + pkg_installed=0 + for pkg in $package_list + do + pkg_query=$($PACKAGE_MGR list installed $pkg) + pkg_installed=$(($? + $pkg_installed)) + done + if [[ $pkg_installed -ne 0 ]] + then + # The minimum required packages have not been installed. + echo "WARN: Required packages not installed. Sleeping for 60 seconds and retrying..." 
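The retry loop above, added in scale_compute_user_data.tpl, treats the running sum of `$PACKAGE_MGR list installed` exit codes as an "every required package is present" test: the sum stays zero only when each query succeeded. A compact equivalent of that check, shown purely as a sketch and assuming `rpm` is available on the RHEL-family images this template targets:

all_installed() {
  # Returns 0 only when every named package is installed (rpm -q exits non-zero for missing ones).
  local missing=0 pkg
  for pkg in "$@"; do
    rpm -q "$pkg" > /dev/null 2>&1 || missing=$((missing + 1))
  done
  return "$missing"
}
# Example: all_installed $package_list && echo "all required packages installed"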
+ touch /var/log/scale-rerun-package-install + echo "INFO: Cleaning and repopulating repository data" + $PACKAGE_MGR clean all + $PACKAGE_MGR makecache + sleep 60 + else + all_pkg_installed=0 + fi + retry_count=$(( $retry_count+1 )) + done + +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi + +yum update --security -y +yum versionlock add $package_list +yum versionlock list +echo 'export PATH=$PATH:/usr/lpp/mmfs/bin' >> /root/.bashrc + +echo "DOMAIN=${compute_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${compute_interfaces}" +echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${compute_interfaces}" +chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +sleep 120 +systemctl restart NetworkManager + +systemctl stop firewalld +firewall-offline-cmd --zone=public --add-port=1191/tcp +firewall-offline-cmd --zone=public --add-port=60000-61000/tcp +firewall-offline-cmd --zone=public --add-port=47080/tcp +firewall-offline-cmd --zone=public --add-port=47080/udp +firewall-offline-cmd --zone=public --add-port=47443/tcp +firewall-offline-cmd --zone=public --add-port=47443/udp +firewall-offline-cmd --zone=public --add-port=4444/tcp +firewall-offline-cmd --zone=public --add-port=4444/udp +firewall-offline-cmd --zone=public --add-port=4739/udp +firewall-offline-cmd --zone=public --add-port=4739/tcp +firewall-offline-cmd --zone=public --add-port=9084/tcp +firewall-offline-cmd --zone=public --add-port=9085/tcp +firewall-offline-cmd --zone=public --add-service=http +firewall-offline-cmd --zone=public --add-service=https + +systemctl start firewalld +systemctl enable firewalld diff --git a/modules/landing_zone_vsi/templates/static_worker_user_data.tpl b/modules/landing_zone_vsi/templates/static_worker_user_data.tpl deleted file mode 100644 index 4235386d..00000000 --- a/modules/landing_zone_vsi/templates/static_worker_user_data.tpl +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/bash -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
-# Licensed under the Apache License v2.0 -################################################### - -logfile=/tmp/user_data.log -echo "Export user data (variable values)" -echo "START $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile - -%EXPORT_USER_DATA% -#input parameters -dns_domain="${dns_domain}" -cluster_private_key_content="${cluster_private_key_content}" -cluster_public_key_content="${cluster_public_key_content}" -cluster_name="${cluster_name}" -mount_path="${mount_path}" -custom_mount_paths="${custom_mount_paths}" -custom_file_shares="${custom_file_shares}" -enable_ldap="${enable_ldap}" -network_interface="${network_interface}" -rc_cidr_block="${rc_cidr_block}" -cluster_prefix="${cluster_prefix}" -ldap_server_ip="${ldap_server_ip}" -ldap_basedns="${ldap_basedns}" -hyperthreading="${hyperthreading}" -management_hostname=${management_hostname} -observability_monitoring_enable="${observability_monitoring_enable}" -observability_monitoring_on_compute_nodes_enable="${observability_monitoring_on_compute_nodes_enable}" -cloud_monitoring_access_key="${cloud_monitoring_access_key}" -cloud_monitoring_ingestion_url="${cloud_monitoring_ingestion_url}" -cloud_logs_ingress_private_endpoint="${cloud_logs_ingress_private_endpoint}" -observability_logs_enable_for_compute="${observability_logs_enable_for_compute}" -VPC_APIKEY_VALUE="${VPC_APIKEY_VALUE}" -echo "END $(date '+%Y-%m-%d %H:%M:%S')" >> $logfile diff --git a/modules/landing_zone_vsi/templates/static_worker_vsi.sh b/modules/landing_zone_vsi/templates/static_worker_vsi.sh deleted file mode 100644 index 6dfa9ce2..00000000 --- a/modules/landing_zone_vsi/templates/static_worker_vsi.sh +++ /dev/null @@ -1,712 +0,0 @@ -#!/bin/sh -# shellcheck disable=all -################################################### -# Copyright (C) IBM Corp. 2021 All Rights Reserved. -# Licensed under the Apache License v2.0 -################################################### - -logfile="/tmp/worker_vsi.log" -echo "START $(date '+%Y-%m-%d %H:%M:%S')" >> "$logfile" - -# Local variable declaration -nfs_server_with_mount_path=${mount_path} -enable_ldap="${enable_ldap}" -ldap_server_ip="${ldap_server_ip}" -base_dn="${ldap_basedns}" -cluster_name=${cluster_name} -HostIP=$(hostname -I | awk '{print $1}') -HostName=$(hostname) -#ManagementHostNames="" -#for (( i=1; i<=management_node_count; i++ )) -#do -# ManagementHostNames+=" ${cluster_prefix}-mgmt-$i" -#done - -mgmt_hostname_primary="$management_hostname" -mgmt_hostnames="${management_hostname},${management_cand_hostnames}" -mgmt_hostnames="${mgmt_hostnames//,/ }" # replace commas with spaces -mgmt_hostnames="${mgmt_hostnames# }" # remove an initial space -mgmt_hostnames="${mgmt_hostnames% }" # remove a final space - -# Setup Network configuration -# Change the MTU setting as this is required for setting mtu as 9000 for communication to happen between clusters -if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release; then - # Replace the MTU value in the Netplan configuration - echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${network_interface}" - echo "DOMAIN=\"${dns_domain}\"" >> "/etc/sysconfig/network-scripts/ifcfg-${network_interface}" - # Change the MTU setting as 9000 at router level. 
- gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) - echo "${rc_cidr_block} via $gateway_ip dev ${network_interface} metric 0 mtu 9000" >> /etc/sysconfig/network-scripts/route-eth0 - systemctl restart NetworkManager -elif grep -q "NAME=\"Ubuntu\"" /etc/os-release; then - net_int=$(basename /sys/class/net/en*) - netplan_config="/etc/netplan/50-cloud-init.yaml" - gateway_ip=$(ip route | grep default | awk '{print $3}' | head -n 1) - cidr_range=$(ip route show | grep "kernel" | awk '{print $1}' | head -n 1) - usermod -s /bin/bash lsfadmin - # Replace the MTU value in the Netplan configuration - if ! grep -qE "^[[:space:]]*mtu: 9000" $netplan_config; then - echo "MTU 9000 Packages entries not found" - # Append the MTU configuration to the Netplan file - sudo sed -i '/'$net_int':/a\ mtu: 9000' $netplan_config - sudo sed -i '/dhcp4: true/a \ nameservers:\n search: ['$dns_domain']' $netplan_config - sudo sed -i '/'$net_int':/a\ routes:\n - to: '$cidr_range'\n via: '$gateway_ip'\n metric: 100\n mtu: 9000' $netplan_config - sudo netplan apply - echo "MTU set to 9000 on Netplan." - else - echo "MTU entry already exists in Netplan. Skipping." - fi -fi - -# Setup VPC FileShare | NFS Mount -LSF_TOP="/opt/ibm/lsf" -echo "Initiating LSF share mount" >> $logfile -# Function to attempt NFS mount with retries -mount_nfs_with_retries() { - local server_path=$1 - local client_path=$2 - local retries=5 - local success=false - - rm -rf "${client_path}" - mkdir -p "${client_path}" - - for (( j=0; j> $logfile - if mount | grep -q "${client_path}"; then - echo "Mount successful for ${server_path} on ${client_path}" >> $logfile - success=true - break - else - echo "Attempt $((j+1)) of $retries failed for ${server_path} on ${client_path}" >> $logfile - sleep 2 - fi - done - - if [ "$success" = true ]; then - echo "${server_path} ${client_path} nfs rw,sec=sys,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,_netdev 0 0" >> /etc/fstab - else - echo "Mount not found for ${server_path} on ${client_path} after $retries attempts." >> $logfile - rm -rf "${client_path}" - fi - - if [ "$success" = true ]; then - return 0 - else - return 1 - fi -} - -# Setup LSF share -if [ -n "${nfs_server_with_mount_path}" ]; then - echo "File share ${nfs_server_with_mount_path} found" >> $logfile - nfs_client_mount_path="/mnt/lsf" - rm -rf /opt/ibm/lsf/conf/ - rm -rf /opt/ibm/lsf/work/ - if mount_nfs_with_retries "${nfs_server_with_mount_path}" "${nfs_client_mount_path}"; then - # Move stuff to shared fs - for dir in conf work; do - mv "${LSF_TOP}/$dir" "${nfs_client_mount_path}" - ln -fs "${nfs_client_mount_path}/$dir" "${LSF_TOP}/$dir" - done - chown -R lsfadmin:root "${LSF_TOP}" - else - echo "Mount not found for ${nfs_server_with_mount_path}, Exiting !!" >> $logfile - exit 1 - fi -else - echo "No NFS server mount path provided, Exiting !!" >> $logfile - exit 1 -fi -echo "Setting LSF share is completed." >> $logfile - -# Setup Custom file shares -echo "Setting custom file shares." >> $logfile -if [ -n "${custom_file_shares}" ]; then - echo "Custom file share ${custom_file_shares} found" >> $logfile - file_share_array=(${custom_file_shares}) - mount_path_array=(${custom_mount_paths}) - length=${#file_share_array[@]} - - for (( i=0; i> $logfile - -# Setup LSF environment variables -LSF_TOP="/opt/ibm/lsf_worker" -LSF_TOP_VERSION=10.1 -LSF_CONF="$LSF_TOP/conf" -LSF_CONF_FILE="$LSF_CONF/lsf.conf" -LSF_HOSTS_FILE="/opt/ibm/lsf/conf/hosts" -. 
"$LSF_CONF/profile.lsf" -echo "Logging env variables" >> "$logfile" -env | sort >> "$logfile" - -# Update lsf configuration -echo 'LSB_MC_DISABLE_HOST_LOOKUP=Y' >> $LSF_CONF_FILE -echo "LSF_RSH=\"ssh -o 'PasswordAuthentication no' -o 'StrictHostKeyChecking no'\"" >> $LSF_CONF_FILE -sed -i "s/LSF_SERVER_HOSTS=.*/LSF_SERVER_HOSTS=\"$mgmt_hostnames\"/g" $LSF_CONF_FILE - -# Update the entry to LSF_HOSTS_FILE -#sed -i "s/^$HostIP .*/$HostIP $HostName/g" /opt/ibm/lsf/conf/hosts -#if grep -q "^$HostIP" "$LSF_HOSTS_FILE"; then -# sed -i "s/^$HostIP .*/$HostIP $HostName/g" "$LSF_HOSTS_FILE" -#else -# echo "$HostIP $HostName" >> "$LSF_HOSTS_FILE" -#fi -#echo "$HostIP $HostName" >> "$LSF_HOSTS_FILE" - -MAX_RETRIES=5 -count=0 - -# Loop to attempt the update until successful or max retries reached -for ((i=1; i<=MAX_RETRIES; i++)); do - # Attempt to update the entry - sed -i "s/^$HostIP .*/$HostIP $HostName/g" "$LSF_HOSTS_FILE" - - # Validate if the update was successful - if grep -q "^$HostIP $HostName" "$LSF_HOSTS_FILE"; then - echo "Successfully updated $HostIP $HostName in $LSF_HOSTS_FILE." - break - else - echo "Attempt $i: Update failed, retrying..." - sleep 5 - fi - - # Check if max retries reached - if [ "$i" -eq "$MAX_RETRIES" ]; then - echo "Failed to update $HostIP $HostName in $LSF_HOSTS_FILE after $MAX_RETRIES attempts." - exit 1 - fi -done - -for hostname in $mgmt_hostnames; do - while ! grep "$hostname" "/opt/ibm/lsf/conf/hosts"; do - echo "Waiting for $hostname to be added to LSF host file" >> $logfile - sleep 5 - done -done - -# TODO: Understand usage -# Support rc_account resource to enable RC_ACCOUNT policy -if [ -n "${rc_account}" ]; then - sed -i "s/\(LSF_LOCAL_RESOURCES=.*\)\"/\1 [resourcemap ${rc_account}*rc_account]\"/" $LSF_CONF_FILE - echo "Update LSF_LOCAL_RESOURCES lsf.conf successfully, add [resourcemap ${rc_account}*rc_account]" >> $logfile -fi -# Support for multiprofiles for the Job submission -if [ -n "${family}" ]; then - sed -i "s/\(LSF_LOCAL_RESOURCES=.*\)\"/\1 [resourcemap ${family}*family]\"/" $LSF_CONF_FILE - echo "update LSF_LOCAL_RESOURCES lsf.conf successfully, add [resourcemap ${pricing}*family]" >> $logfile -fi -# Add additional local resources if needed -instance_id=$(dmidecode | grep Family | cut -d ' ' -f 2 |head -1) -if [ -n "$instance_id" ]; then - sed -i "s/\(LSF_LOCAL_RESOURCES=.*\)\"/\1 [resourcemap $instance_id*instanceID]\"/" $LSF_CONF_FILE - echo "Update LSF_LOCAL_RESOURCES in $LSF_CONF_FILE successfully, add [resourcemap ${instance_id}*instanceID]" >> $logfile -else - echo "Can not get instance ID" >> $logfile -fi - -# Defining ncpus based on hyper-threading -echo "$hyperthreading" -if [ "$hyperthreading" == true ]; then - ego_define_ncpus="threads" -else - ego_define_ncpus="cores" - cat << 'EOT' > /root/lsf_hyperthreading -#!/bin/sh -for vcpu in $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | cut -s -d- -f2 | cut -d- -f2 | uniq); do - echo "0" > "/sys/devices/system/cpu/cpu"$vcpu"/online" -done -EOT - chmod 755 /root/lsf_hyperthreading - command="/root/lsf_hyperthreading" - sh $command && (crontab -l 2>/dev/null; echo "@reboot $command") | crontab - -fi -echo "EGO_DEFINE_NCPUS=${ego_define_ncpus}" >> "$LSF_CONF_FILE" - -#Update LSF Tuning on dynamic hosts -LSF_TUNABLES="etc/sysctl.conf" -echo 'vm.overcommit_memory=1' >> $LSF_TUNABLES -echo 'net.core.rmem_max=26214400' >> $LSF_TUNABLES -echo 'net.core.rmem_default=26214400' >> $LSF_TUNABLES -echo 'net.core.wmem_max=26214400' >> $LSF_TUNABLES -echo 
'net.core.wmem_default=26214400' >> $LSF_TUNABLES -echo 'net.ipv4.tcp_fin_timeout = 5' >> $LSF_TUNABLES -echo 'net.core.somaxconn = 8000' >> $LSF_TUNABLES -sudo sysctl -p $LSF_TUNABLES - -# Setup lsfadmin user -# Updates the lsfadmin user as never expire -chage -I -1 -m 0 -M 99999 -E -1 -W 14 lsfadmin -# Setup ssh -lsfadmin_home_dir="/home/lsfadmin" -lsfadmin_ssh_dir="${lsfadmin_home_dir}/.ssh" -mkdir -p ${lsfadmin_ssh_dir} -if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release; then - sudo cp /home/vpcuser/.ssh/authorized_keys "${lsfadmin_ssh_dir}/authorized_keys" -else - cp /home/ubuntu/.ssh/authorized_keys "${lsfadmin_ssh_dir}/authorized_keys" - sudo cp /home/ubuntu/.profile /home/lsfadmin -fi -cp /home/vpcuser/.ssh/authorized_keys "${lsfadmin_ssh_dir}/authorized_keys" -echo "${cluster_public_key_content}" >> "${lsfadmin_ssh_dir}/authorized_keys" -echo "${cluster_private_key_content}" >> "${lsfadmin_ssh_dir}/id_rsa" -echo "StrictHostKeyChecking no" >> "${lsfadmin_ssh_dir}/config" -chmod 600 "${lsfadmin_ssh_dir}/authorized_keys" -chmod 600 "${lsfadmin_ssh_dir}/id_rsa" -chmod 700 ${lsfadmin_ssh_dir} -chown -R lsfadmin:lsfadmin ${lsfadmin_ssh_dir} -echo "SSH key setup for lsfadmin user is completed" >> $logfile - -# Setup root user -root_ssh_dir="/root/.ssh" -echo "${cluster_public_key_content}" >> $root_ssh_dir/authorized_keys -echo "StrictHostKeyChecking no" >> $root_ssh_dir/config -echo "cluster ssh key has been added to root user" >> $logfile - - -# Create lsf.sudoers file to support single lsfstartup and lsfrestart command from management node -echo 'LSF_STARTUP_USERS="lsfadmin"' | sudo tee -a /etc/lsf1.sudoers -echo "LSF_STARTUP_PATH=$LSF_TOP_VERSION/linux3.10-glibc2.17-x86_64/etc/" | sudo tee -a /etc/lsf.sudoers -chmod 600 /etc/lsf.sudoers -ls -l /etc/lsf.sudoers - -# Change LSF_CONF= value in lsf_daemons -cd /opt/ibm/lsf_worker/10.1/linux3.10-glibc2.17-x86_64/etc/ -sed -i "s|/opt/ibm/lsf/|/opt/ibm/lsf_worker/|g" lsf_daemons -cd - - -sudo ${LSF_TOP}/10.1/install/hostsetup --top="${LSF_TOP}" --setuid ### WARNING: LSF_TOP may be unset here -echo "Added LSF administrators to start LSF daemons" >> $logfile - -# Install LSF as a service and start up -/opt/ibm/lsf_worker/10.1/install/hostsetup --top="/opt/ibm/lsf_worker" --boot="y" --start="y" --dynamic 2>&1 >> $logfile -systemctl status lsfd -cat /opt/ibm/lsf/conf/hosts >> /etc/hosts - -lsfadmin_home_dir="/home/lsfadmin" -echo "source ${LSF_CONF}/profile.lsf" >> /root/.bashrc -echo "source ${LSF_CONF}/profile.lsf" >> "${lsfadmin_home_dir}"/.bashrc - -# -## Create lsf.sudoers file to support single lsfstartup and lsfrestart command from management node -#cat < "/etc/lsf.sudoers" -#LSF_STARTUP_USERS="lsfadmin" -#LSF_STARTUP_PATH=$LSF_TOP_VERSION/linux3.10-glibc2.17-x86_64/etc/ -#EOT -#chmod 600 /etc/lsf.sudoers -#ls -l /etc/lsf.sudoers -# -#$LSF_TOP_VERSION/install/hostsetup --top="$LSF_TOP" --setuid -#echo "Added LSF administrators to start LSF daemons" -# -#lsfadmin_home_dir="/home/lsfadmin" -#echo "source ${LSF_CONF}/profile.lsf" >> /root/.bashrc -#echo "source ${LSF_CONF}/profile.lsf" >> "${lsfadmin_home_dir}"/.bashrc -## Setup ssh -# -# -## Setup lsfadmin user -## Updates the lsfadmin user as never expire -#chage -I -1 -m 0 -M 99999 -E -1 -W 14 lsfadmin -## Setup ssh -#lsfadmin_home_dir="/home/lsfadmin" -#lsfadmin_ssh_dir="${lsfadmin_home_dir}/.ssh" -#mkdir -p ${lsfadmin_ssh_dir} -#if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release; then -# sudo cp /home/vpcuser/.ssh/authorized_keys 
"${lsfadmin_ssh_dir}/authorized_keys" -#else -# cp /home/ubuntu/.ssh/authorized_keys "${lsfadmin_ssh_dir}/authorized_keys" -# sudo cp /home/ubuntu/.profile /home/lsfadmin -#fi -#cp /home/vpcuser/.ssh/authorized_keys "${lsfadmin_ssh_dir}/authorized_keys" -#echo "${cluster_public_key_content}" >> "${lsfadmin_ssh_dir}/authorized_keys" -#echo "${cluster_private_key_content}" >> "${lsfadmin_ssh_dir}/id_rsa" -#echo "StrictHostKeyChecking no" >> "${lsfadmin_ssh_dir}/config" -#chmod 600 "${lsfadmin_ssh_dir}/authorized_keys" -#chmod 600 "${lsfadmin_ssh_dir}/id_rsa" -#chmod 700 ${lsfadmin_ssh_dir} -#chown -R lsfadmin:lsfadmin ${lsfadmin_ssh_dir} -#echo "SSH key setup for lsfadmin user is completed" >> $logfile -# -## Setup root user -#root_ssh_dir="/root/.ssh" -#echo "${cluster_public_key_content}" >> $root_ssh_dir/authorized_keys -#echo "StrictHostKeyChecking no" >> $root_ssh_dir/config -#echo "cluster ssh key has been added to root user" >> $logfile -# -## Update LSF Tunables -#LSF_TUNABLES="/etc/sysctl.conf" -#echo "1" > /proc/sys/vm/overcommit_memory -#echo 'vm.overcommit_memory=1' > "$LSF_TUNABLES" -#echo 'net.core.rmem_max=26214400' >> "$LSF_TUNABLES" -#echo 'net.core.rmem_default=26214400' >> "$LSF_TUNABLES" -#echo 'net.core.wmem_max=26214400' >> "$LSF_TUNABLES" -#echo 'net.core.wmem_default=26214400' >> "$LSF_TUNABLES" -#echo 'net.ipv4.tcp_fin_timeout = 5' >> "$LSF_TUNABLES" -#echo 'net.core.somaxconn = 8000' >> "$LSF_TUNABLES" -#sysctl -p "$LSF_TUNABLES" -# -## Defining ncpus based on hyper-threading -#echo "$hyperthreading" -#if [ "$hyperthreading" == true ]; then -# ego_define_ncpus="threads" -#else -# ego_define_ncpus="cores" -# cat << 'EOT' > /root/lsf_hyperthreading -##!/bin/sh -#for vcpu in $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | cut -s -d- -f2 | cut -d- -f2 | uniq); do -# echo "0" > "/sys/devices/system/cpu/cpu"$vcpu"/online" -#done -#EOT -# chmod 755 /root/lsf_hyperthreading -# command="/root/lsf_hyperthreading" -# sh $command && (crontab -l 2>/dev/null; echo "@reboot $command") | crontab - -#fi -#echo "EGO_DEFINE_NCPUS=${ego_define_ncpus}" >> "$LSF_CONF_FILE" -# -## Update lsf configuration -#echo 'LSB_MC_DISABLE_HOST_LOOKUP=Y' >> "$LSF_CONF_FILE" -#sed -i "s/LSF_LOCAL_RESOURCES/#LSF_LOCAL_RESOURCES/" "$LSF_CONF_FILE" -#echo "LSF_RSH=\"ssh -o 'PasswordAuthentication no' -o 'StrictHostKeyChecking no'\"" >> "$LSF_CONF_FILE" -##sed -i "s/LSF_SERVER_HOSTS=.*/LSF_SERVER_HOSTS=\"$ManagementHostNames\"/g" "$LSF_CONF_FILE" -#echo "LSF_SERVER_HOSTS=\"$mgmt_hostnames\"" >> "$LSF_CONF_FILE" -# -#cat << EOF > /etc/profile.d/lsf.sh -#ls /opt/ibm/lsf_worker/conf/lsf.conf > /dev/null 2> /dev/null < /dev/null & -##usleep 10000 -#PID=\$! -#if kill -0 \$PID 2> /dev/null; then -# # lsf.conf is not accessible -# kill -KILL \$PID 2> /dev/null > /dev/null -# wait \$PID -#else -# source /opt/ibm/lsf_worker/conf/profile.lsf -#fi -#PATHs=\`echo "\$PATH" | sed -e 's/:/\n/g'\` -#for path in /usr/local/bin /usr/bin /usr/local/sbin /usr/sbin; do -# PATHs=\`echo "\$PATHs" | grep -v \$path\` -#done -#export PATH=/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:\`echo "\$PATHs" | paste -s -d :\` -#EOF -# -##sed -i "s/^$HostIP .*/$HostIP $HostName/g" /opt/ibm/lsf/conf/hosts -##for hostname in $mgmt_hostnames; do -## while ! 
grep "$hostname" "$LSF_HOSTS_FILE"; do -## echo "Waiting for $hostname to be added to LSF host file" -## sleep 5 -## done -## echo "$hostname found in LSF host file" -##done -##cat $LSF_HOSTS_FILE >> /etc/hosts -# -## Create lsf.sudoers file to support single lsfstartup and lsfrestart command from management node -## Create lsf.sudoers file to support single lsfstartup and lsfrestart command from management node -#echo 'LSF_STARTUP_USERS="lsfadmin"' | sudo tee -a /etc/lsf1.sudoers -#echo "LSF_STARTUP_PATH=$LSF_TOP_VERSION/linux3.10-glibc2.17-x86_64/etc/" | sudo tee -a /etc/lsf.sudoers -#chmod 600 /etc/lsf.sudoers -#ls -l /etc/lsf.sudoers -# -## Change LSF_CONF= value in lsf_daemons -#cd /opt/ibm/lsf_worker/10.1/linux3.10-glibc2.17-x86_64/etc/ -#sed -i "s|/opt/ibm/lsf/|/opt/ibm/lsf_worker/|g" lsf_daemons -#cd - -# -#sudo /opt/ibm/lsf/10.1/install/hostsetup --top="${LSF_TOP}" --setuid ### WARNING: LSF_TOP may be unset here -#echo "Added LSF administrators to start LSF daemons" >> $logfile -# -## Install LSF as a service and start up -#/opt/ibm/lsf_worker/10.1/install/hostsetup --top="/opt/ibm/lsf_worker" --boot="y" --start="y" --dynamic 2>&1 >> $logfile -#systemctl status lsfd -#cat /opt/ibm/lsf/conf/hosts >> /etc/hosts - - -# Setting up the LDAP configuration -# Setting up the LDAP configuration -if [ "$enable_ldap" = "true" ]; then - - # Detect if the operating system is RHEL or Rocky Linux - if grep -q "NAME=\"Red Hat Enterprise Linux\"" /etc/os-release || grep -q "NAME=\"Rocky Linux\"" /etc/os-release; then - - # Detect RHEL or Rocky version - version=$(grep -oE 'release [0-9]+' /etc/redhat-release | awk '{print $2}') - - # Proceed if the detected version is either 8 or 9 - if [ "$version" == "8" ] || [ "$version" == "9" ]; then - echo "Detected as RHEL or Rocky $version. Proceeding with LDAP client configuration..." >> $logfile - - # Enable password authentication for SSH by modifying the configuration file - sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config - systemctl restart sshd - - # Check if the SSL certificate file exists, then copy it to the correct location - # Retry finding SSL certificate with a maximum of 5 attempts and 5 seconds sleep between retries - for attempt in {1..5}; do - if [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ]; then - echo "LDAP SSL cert found under /mnt/lsf/openldap/ldap_cacert.pem path" >> $logfile - mkdir -p /etc/openldap/certs/ - cp -pr /mnt/lsf/openldap/ldap_cacert.pem /etc/openldap/certs/ldap_cacert.pem - break - else - echo "SSL cert not found on attempt $attempt. Retrying in 5 seconds..." >> $logfile - sleep 5 - fi - done - # Exit if the SSL certificate is still not found after 5 attempts - [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ] || { echo "SSL cert not found after 5 attempts. Exiting." 
>> $logfile; exit 1; } - - - # Create and configure the SSSD configuration file for LDAP integration - cat < /etc/sssd/sssd.conf -[sssd] -config_file_version = 2 -services = nss, pam, autofs -domains = default - -[nss] -homedir_substring = /home - -[pam] - -[domain/default] -id_provider = ldap -autofs_provider = ldap -auth_provider = ldap -chpass_provider = ldap -ldap_uri = ldap://${ldap_server_ip} -ldap_search_base = dc=${base_dn%%.*},dc=${base_dn#*.} -ldap_id_use_start_tls = True -ldap_tls_cacertdir = /etc/openldap/certs -cache_credentials = True -ldap_tls_reqcert = allow -EOF - - # Secure the SSSD configuration file by setting appropriate permissions - chmod 600 /etc/sssd/sssd.conf - chown root:root /etc/sssd/sssd.conf - - # Create and configure the OpenLDAP configuration file for TLS - cat < /etc/openldap/ldap.conf -BASE dc=${base_dn%%.*},dc=${base_dn#*.} -URI ldap://${ldap_server_ip} -TLS_CACERT /etc/openldap/certs/ldap_cacert.pem -TLS_CACERTDIR /etc/openldap/certs -EOF - - # Rehash certificates in the OpenLDAP directory to ensure proper recognition - openssl rehash /etc/openldap/certs - - # Apply the SSSD and home directory creation configuration using authselect - authselect select sssd with-mkhomedir --force - - # Enable and start the SSSD and oddjobd services for user authentication and home directory management - systemctl enable --now sssd oddjobd - - # Restart both services to apply the configuration - systemctl restart sssd oddjobd - - # Validate the LDAP configuration by performing a test search using ldapsearch - if ldapsearch -x -H ldap://"${ldap_server_ip}"/ -b "dc=${base_dn%%.*},dc=${base_dn#*.}" > /dev/null; then - echo "LDAP configuration completed successfully!" >> $logfile - else - echo "LDAP configuration failed! Exiting." >> $logfile - exit 1 - fi - - # Ensure LSF commands are available to all users by adding the profile to bashrc - echo ". ${LSF_CONF}/profile.lsf" >> /etc/bashrc - source /etc/bashrc - - else - echo "This script is intended for RHEL and Rocky Linux 8 or 9. Detected version: $version. Exiting." >> $logfile - exit 1 - fi - - # Detect if the operating system is Ubuntu - elif grep -q "NAME=\"Ubuntu\"" /etc/os-release; then - # Log detected OS - echo "Detected as Ubuntu. Proceeding with LDAP client configuration..." >> $logfile - - # Allow password authentication for SSH in two configuration files, then restart the SSH service - sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config - sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config.d/50-cloudimg-settings.conf - sudo systemctl restart ssh - - # Add configuration for automatic home directory creation to the PAM session configuration file - sudo sed -i '$ i\session required pam_mkhomedir.so skel=/etc/skel umask=0022\' /etc/pam.d/common-session - - # Check if the SSL certificate file exists, then copy it to the correct location - # Retry finding SSL certificate with a maximum of 5 attempts and 5 seconds sleep between retries - for attempt in {1..5}; do - if [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ]; then - mkdir -p /etc/ldap/certs/ - echo "LDAP SSL cert found under /mnt/lsf/openldap/ldap_cacert.pem path" >> $logfile - cp -pr /mnt/lsf/openldap/ldap_cacert.pem /etc/ldap/certs/ldap_cacert.pem - break - else - echo "SSL cert not found on attempt $attempt. Retrying in 5 seconds..." 
>> $logfile - sleep 5 - fi - done - # Exit if the SSL certificate is still not found after 5 attempts - [ -f "/mnt/lsf/openldap/ldap_cacert.pem" ] || { echo "SSL cert not found after 5 attempts. Exiting." >> $logfile; exit 1; } - - # Create and configure the SSSD configuration file for LDAP integration on Ubuntu - cat < /etc/sssd/sssd.conf -[sssd] -config_file_version = 2 -services = nss, pam, autofs -domains = default - -[nss] -homedir_substring = /home - -[pam] - -[domain/default] -id_provider = ldap -autofs_provider = ldap -auth_provider = ldap -chpass_provider = ldap -ldap_uri = ldap://${ldap_server_ip} -ldap_search_base = dc=${base_dn%%.*},dc=${base_dn#*.} -ldap_id_use_start_tls = True -ldap_tls_cacertdir = /etc/ldap/certs -cache_credentials = True -ldap_tls_reqcert = allow -EOF - - # Secure the SSSD configuration file by setting appropriate permissions - sudo chmod 600 /etc/sssd/sssd.conf - sudo chown root:root /etc/sssd/sssd.conf - - # Create and configure the OpenLDAP configuration file for TLS on Ubuntu - cat < /etc/ldap/ldap.conf -BASE dc=${base_dn%%.*},dc=${base_dn#*.} -URI ldap://${ldap_server_ip} -TLS_CACERT /etc/ldap/certs/ldap_cacert.pem -TLS_CACERTDIR /etc/ldap/certs -EOF - - # Rehash certificates in the OpenLDAP directory to ensure proper recognition - openssl rehash /etc/ldap/certs - - # Enable and start the SSSD and oddjobd services for user authentication and home directory management - sudo systemctl enable --now sssd oddjobd && sudo systemctl restart sssd oddjobd - - # Ensure LSF commands are available to all users by adding the profile to bash.bashrc - echo ". ${LSF_CONF}/profile.lsf" >> /etc/bash.bashrc - source /etc/bash.bashrc - - # Validate the LDAP configuration by checking the status of the SSSD service - if sudo systemctl is-active --quiet sssd; then - echo "LDAP client configuration completed successfully!" >> $logfile - else - echo "LDAP client configuration failed! Exiting." >> $logfile - exit 1 - fi - - else - echo "This script is designed for RHEL, Rocky Linux, or Ubuntu. Unsupported OS detected. Exiting." 
>> $logfile - exit 1 - fi -fi - -systemctl status lsfd >> "$logfile" - -# Setting up the Metrics Agent - -if [ "$cloud_monitoring_access_key" != "" ] && [ "$cloud_monitoring_ingestion_url" != "" ]; then - - SYSDIG_CONFIG_FILE="/opt/draios/etc/dragent.yaml" - - #packages installation - echo "Writing sysdig config file" >> "$logfile" - - #sysdig config file - echo "Setting customerid access key" >> "$logfile" - sed -i "s/==ACCESSKEY==/$cloud_monitoring_access_key/g" $SYSDIG_CONFIG_FILE - sed -i "s/==COLLECTOR==/$cloud_monitoring_ingestion_url/g" $SYSDIG_CONFIG_FILE - echo "tags: type:compute,lsf:true" >> $SYSDIG_CONFIG_FILE -else - echo "Skipping metrics agent configuration due to missing parameters" >> "$logfile" -fi - -if [ "$observability_monitoring_on_compute_nodes_enable" = true ]; then - - echo "Restarting sysdig agent" >> "$logfile" - systemctl enable dragent - systemctl restart dragent - else - echo "Metrics agent start skipped since monitoring provisioning is not enabled" >> "$logfile" -fi - -# Setting up the IBM Cloud Logs -if [ "$observability_logs_enable_for_compute" = true ]; then - - echo "Configuring cloud logs for compute since observability logs for compute is enabled" - sudo cp /root/post-config.sh /opt/ibm - cd /opt/ibm - - cat < /etc/fluent-bit/fluent-bit.conf -[SERVICE] - Flush 1 - Log_Level info - Daemon off - Parsers_File parsers.conf - Plugins_File plugins.conf - HTTP_Server On - HTTP_Listen 0.0.0.0 - HTTP_Port 9001 - Health_Check On - HC_Errors_Count 1 - HC_Retry_Failure_Count 1 - HC_Period 30 - storage.path /fluent-bit/cache - storage.max_chunks_up 192 - storage.metrics On - -[INPUT] - Name syslog - Path /tmp/in_syslog - Buffer_Chunk_Size 32000 - Buffer_Max_Size 64000 - Receive_Buffer_Size 512000 - -[INPUT] - Name tail - Tag * - Path /opt/ibm/lsf_worker/log/*.log - Path_Key file - Exclude_Path /var/log/at/** - DB /opt/ibm/lsf_worker/log/fluent-bit.DB - Buffer_Chunk_Size 32KB - Buffer_Max_Size 256KB - Skip_Long_Lines On - Refresh_Interval 10 - storage.type filesystem - storage.pause_on_chunks_overlimit on - -[FILTER] - Name modify - Match * - Add subsystemName compute - Add applicationName lsf - -@INCLUDE output-logs-router-agent.conf -EOL - - sudo chmod +x post-config.sh - sudo ./post-config.sh -h $cloud_logs_ingress_private_endpoint -p "3443" -t "/logs/v1/singles" -a IAMAPIKey -k $VPC_APIKEY_VALUE --send-directly-to-icl -s true -i Production - sudo echo "2024-10-16T14:31:16+0000 INFO Testing IBM Cloud LSF Logs from compute: $HostName" >> /opt/ibm/lsf_worker/log/test.log - sudo logger -u /tmp/in_syslog my_ident my_syslog_test_message_from_compute:$HostName - -else - echo "Cloud Logs configuration skipped since observability logs for compute is not enabled" -fi - -echo "END $(date '+%Y-%m-%d %H:%M:%S')" >> "$logfile" diff --git a/modules/landing_zone_vsi/templates/storage_user_data.tpl b/modules/landing_zone_vsi/templates/storage_user_data.tpl index 4d7c8cc6..3b33285e 100644 --- a/modules/landing_zone_vsi/templates/storage_user_data.tpl +++ b/modules/landing_zone_vsi/templates/storage_user_data.tpl @@ -6,6 +6,8 @@ ################################################### #!/usr/bin/env bash +exec > >(tee /var/log/ibm_spectrumscale_user-data.log) + if grep -E -q "CentOS|Red Hat" /etc/os-release then USER=vpcuser @@ -13,7 +15,7 @@ elif grep -q "Ubuntu" /etc/os-release then USER=ubuntu fi -sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please login as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; 
exit 142\" /" /root/.ssh/authorized_keys +sed -i -e "s/^/no-port-forwarding,no-agent-forwarding,no-X11-forwarding,command=\"echo \'Please client as the user \\\\\"$USER\\\\\" rather than the user \\\\\"root\\\\\".\';echo;sleep 5; exit 142\" /" /root/.ssh/authorized_keys # input parameters echo "${bastion_public_key_content}" >> ~/.ssh/authorized_keys @@ -22,22 +24,113 @@ echo "StrictHostKeyChecking no" >> ~/.ssh/config echo "${storage_private_key_content}" > ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa -# scale pre-requisite installation +# if grep -q "Red Hat" /etc/os-release if grep -q "CentOS|Red Hat" /etc/os-release then USER=vpcuser - if grep -q "platform:el8" /etc/os-release + REQ_PKG_INSTALLED=0 + if grep -q "platform:el9" /etc/os-release + then + PACKAGE_MGR=dnf + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables-nft nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" + elif grep -q "platform:el8" /etc/os-release then PACKAGE_MGR=dnf - package_list="python38 kernel-devel-$(uname -r) kernel-headers-$(uname -r)" + package_list="python38 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl jq make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel python3-dnf-plugin-versionlock" else PACKAGE_MGR=yum - package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r)" + package_list="python3 kernel-devel-$(uname -r) kernel-headers-$(uname -r) firewalld numactl make gcc-c++ elfutils-libelf-devel bind-utils iptables nfs-utils elfutils elfutils-devel yum-plugin-versionlock" fi + + RETRY_LIMIT=5 + retry_count=0 + all_pkg_installed=1 + + while [[ $all_pkg_installed -ne 0 && $retry_count -lt $RETRY_LIMIT ]] + do + # Install all required packages + echo "INFO: Attempting to install packages" + $PACKAGE_MGR install -y $package_list + + # Check to ensure packages are installed + pkg_installed=0 + for pkg in $package_list + do + pkg_query=$($PACKAGE_MGR list installed $pkg) + pkg_installed=$(($? + $pkg_installed)) + done + if [[ $pkg_installed -ne 0 ]] + then + # The minimum required packages have not been installed. + echo "WARN: Required packages not installed. Sleeping for 60 seconds and retrying..." + touch /var/log/scale-rerun-package-install + echo "INFO: Cleaning and repopulating repository data" + $PACKAGE_MGR clean all + $PACKAGE_MGR makecache + sleep 60 + else + all_pkg_installed=0 + fi + retry_count=$(( $retry_count+1 )) + done + +elif grep -q "Ubuntu" /etc/os-release +then + USER=ubuntu +fi + +yum update --security -y +yum versionlock $package_list +yum versionlock list +echo 'export PATH=$PATH:/usr/lpp/mmfs/bin' >> /root/.bashrc + +if [[ "${storage_disk_type}" == "fixed" ]] +then + echo "###########################################################################################" >> /etc/motd + echo "# You have logged in to Instance storage virtual server. #" >> /etc/motd + echo "# - Instance storage is temporary storage that's available only while your virtual #" >> /etc/motd + echo "# server is running. #" >> /etc/motd + echo "# - Data on the drive is unrecoverable after instance shutdown, disruptive maintenance, #" >> /etc/motd + echo "# or hardware failure. 
#" >> /etc/motd + echo "# #" >> /etc/motd + echo "# Refer: https://cloud.ibm.com/docs/vpc?topic=vpc-instance-storage #" >> /etc/motd + echo "###########################################################################################" >> /etc/motd fi -# network setup echo "DOMAIN=${storage_dns_domain}" >> "/etc/sysconfig/network-scripts/ifcfg-${storage_interfaces}" echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-${storage_interfaces}" chage -I -1 -m 0 -M 99999 -E -1 -W 14 vpcuser +sleep 120 systemctl restart NetworkManager + +systemctl stop firewalld +firewall-offline-cmd --zone=public --add-port=1191/tcp +firewall-offline-cmd --zone=public --add-port=4444/tcp +firewall-offline-cmd --zone=public --add-port=4444/udp +firewall-offline-cmd --zone=public --add-port=4739/udp +firewall-offline-cmd --zone=public --add-port=4739/tcp +firewall-offline-cmd --zone=public --add-port=9084/tcp +firewall-offline-cmd --zone=public --add-port=9085/tcp +firewall-offline-cmd --zone=public --add-service=http +firewall-offline-cmd --zone=public --add-service=https +firewall-offline-cmd --zone=public --add-port=2049/tcp +firewall-offline-cmd --zone=public --add-port=2049/udp +firewall-offline-cmd --zone=public --add-port=111/tcp +firewall-offline-cmd --zone=public --add-port=111/udp +firewall-offline-cmd --zone=public --add-port=30000-61000/tcp +firewall-offline-cmd --zone=public --add-port=30000-61000/udp +systemctl start firewalld +systemctl enable firewalld + +if [ "${enable_protocol}" == true ]; then + sec_interface=$(nmcli -t con show --active | grep eth1 | cut -d ':' -f 1) + nmcli conn del "$sec_interface" + nmcli con add type ethernet con-name eth1 ifname eth1 + echo "DOMAIN=\"${protocol_dns_domain}\"" >> "/etc/sysconfig/network-scripts/ifcfg-eth1" + echo "MTU=9000" >> "/etc/sysconfig/network-scripts/ifcfg-eth1" + systemctl restart NetworkManager + ###### TODO: Fix Me ###### + echo 'export IC_REGION=${vpc_region}' >> /root/.bashrc + echo 'export IC_SUBNET=${protocol_subnets}' >> /root/.bashrc + echo 'export IC_RG=${resource_group_id}' >> /root/.bashrc +fi diff --git a/modules/landing_zone_vsi/variables.tf b/modules/landing_zone_vsi/variables.tf index afaa04bb..b22f7e97 100644 --- a/modules/landing_zone_vsi/variables.tf +++ b/modules/landing_zone_vsi/variables.tf @@ -1,21 +1,10 @@ ############################################################################## # Offering Variations ############################################################################## -# variable "storage_type" { -# type = string -# default = "scratch" -# description = "Select the required storage type(scratch/persistent/eval)." -# } - -############################################################################## -# Account Variables -############################################################################## - -variable "ibmcloud_api_key" { - description = "IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required." +variable "storage_type" { type = string - sensitive = true - default = null + default = "scratch" + description = "Select the required storage type(scratch/persistent/eval)." } ############################################################################## @@ -47,6 +36,13 @@ variable "zones" { type = list(string) } + +variable "cluster_cidr" { + description = "Network CIDR of the VPC. This is used to manage network security rules for cluster provisioning." 
+ type = string + default = "10.241.0.0/18" +} + ############################################################################## # VPC Variables ############################################################################## @@ -56,15 +52,16 @@ variable "vpc_id" { description = "ID of an existing VPC in which the cluster resources will be deployed." } +variable "placement_group_ids" { + type = string + default = null + description = "VPC placement group ids" +} + ############################################################################## # Access Variables ############################################################################## -variable "bastion_fip" { - type = string - description = "Bastion FIP." -} - variable "bastion_security_group_id" { type = string description = "Bastion security group id." @@ -77,57 +74,216 @@ variable "bastion_public_key_content" { description = "Bastion security group id." } -variable "cluster_user" { - type = string - description = "Linux user for cluster administration." -} - -variable "compute_private_key_content" { - type = string - description = "Compute private key content" -} - -variable "bastion_private_key_content" { +variable "storage_security_group_id" { type = string - description = "Bastion private key content" + default = null + description = "Existing Scale storage security group id" } ############################################################################## # Compute Variables ############################################################################## -variable "compute_subnets" { +variable "client_subnets" { + type = list(object({ + name = string + id = string + zone = string + cidr = string + })) + default = [] + description = "Subnets to launch the client hosts." +} + +variable "client_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Number of instances to be launched for client." +} + +variable "cluster_subnet_id" { type = list(object({ name = string id = string zone = string cidr = string - crn = string })) default = [] description = "Subnets to launch the compute host." } -variable "compute_ssh_keys" { +variable "ssh_keys" { type = list(string) + default = [] description = "The key pair to use to launch the compute host." } -variable "management_image_name" { - type = string - description = "Image name to use for provisioning the management cluster instances." +variable "management_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Number of instances to be launched for management." +} + +variable "static_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 1 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Min Number of instances to be launched for compute cluster." +} + +variable "dynamic_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 250 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "MaxNumber of instances to be launched for compute cluster." 
+} + +# variable "compute_gui_username" { +# type = string +# default = "admin" +# sensitive = true +# description = "GUI user to perform system management and monitoring tasks on compute cluster." +# } + +# variable "compute_gui_password" { +# type = string +# default = "hpc@IBMCloud" +# sensitive = true +# description = "Password for compute cluster GUI" +# } + +############################################################################## +# Scale Storage Variables +############################################################################## + +variable "storage_subnets" { + type = list(object({ + name = string + id = string + zone = string + cidr = string + })) + default = [] + description = "Subnets to launch the storage host." +} + +variable "storage_instances" { + type = list( + object({ + profile = string + count = number + image = string + filesystem_name = optional(string) + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem_name = "fs1" + }] + description = "Number of instances to be launched for storage cluster." +} + +variable "storage_servers" { + type = list( + object({ + profile = string + count = number + image = string + filesystem = string + }) + ) + default = [{ + profile = "cx2d-metal-96x192" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem = "fs1" + }] + description = "Number of BareMetal Servers to be launched for storage cluster." +} + +variable "protocol_subnets" { + type = list(object({ + name = string + id = string + zone = string + cidr = string + })) + default = [] + description = "Subnets to launch the bastion host." } -variable "compute_image_name" { - type = string - default = "hpcaas-lsf10-rhel810-compute-v8" - description = "Image name to use for provisioning the compute cluster instances." +variable "protocol_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Number of instances to be launched for protocol hosts." } -variable "login_image_name" { - type = string - default = "hpcaas-lsf10-rhel810-compute-v8" - description = "Image name to use for provisioning the login instance." +variable "colocate_protocol_instances" { + type = bool + default = true + description = "Enable it to use storage instances as protocol instances" +} + +variable "nsd_details" { + type = list( + object({ + profile = string + capacity = optional(number) + iops = optional(number) + }) + ) + default = null + description = "NSD details" } ############################################################################## @@ -136,14 +292,18 @@ variable "login_image_name" { variable "dns_domain_names" { type = object({ - compute = string - #storage = string - #protocol = string + compute = string + storage = optional(string) + protocol = optional(string) + client = optional(string) + gklm = optional(string) }) default = { compute = "comp.com" storage = "strg.com" protocol = "ces.com" + client = "clnt.com" + gklm = "gklm.com" } description = "IBM Cloud HPC DNS domain names." } @@ -165,332 +325,177 @@ variable "boot_volume_encryption_key" { description = "CRN of boot volume encryption key" } -variable "cluster_id" { - type = string - description = "Ensure that you have received the cluster ID from IBM technical sales. A unique identifer for HPC cluster used by IBM Cloud HPC to differentiate different HPC clusters within the same contract. 
This can be up to 39 alphanumeric characters including the underscore (_), the hyphen (-), and the period (.) characters. You cannot change the cluster ID after deployment." -} - -variable "contract_id" { - type = string - sensitive = true - description = "Ensure that you have received the contract ID from IBM technical sales. Contract ID is a unique identifier to distinguish different IBM Cloud HPC service agreements. It must start with a letter and can only contain letters, numbers, hyphens (-), or underscores (_)." -} - -variable "hyperthreading_enabled" { - type = bool - default = true - description = "Setting this to true will enable hyper-threading in the compute nodes of the cluster (default). Otherwise, hyper-threading will be disabled." -} - -variable "enable_app_center" { - type = bool - default = false - description = "Set to true to enable the IBM Spectrum LSF Application Center GUI (default: false). [System requirements](https://www.ibm.com/docs/en/slac/10.2.0?topic=requirements-system-102-fix-pack-14) for IBM Spectrum LSF Application Center Version 10.2 Fix Pack 14." -} - -variable "app_center_gui_pwd" { - type = string - sensitive = true - default = "" - description = "Password for IBM Spectrum LSF Application Center GUI. Note: Password should be at least 8 characters, must have one number, one lowercase letter, one uppercase letter, and at least one special character." -} - -variable "management_node_count" { - type = number - default = 3 - description = "Number of management nodes. This is the total number of management nodes. Enter a value between 1 and 10." - validation { - condition = 1 <= var.management_node_count && var.management_node_count <= 10 - error_message = "Input \"management_node_count\" must be must be greater than or equal to 1 and less than or equal to 10." - } -} - -variable "management_node_instance_type" { - type = string - default = "bx2-16x64" - description = "Specify the virtual server instance profile type to be used to create the management nodes for the IBM Cloud HPC cluster. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." - validation { - condition = can(regex("^[^\\s]+-[0-9]+x[0-9]+", var.management_node_instance_type)) - error_message = "The profile must be a valid profile name." - } -} - -variable "share_path" { +variable "existing_kms_instance_guid" { type = string - description = "Provide the exact path to where the VPC file share needs to be mounted" -} - -variable "mount_path" { - type = list(object({ - mount_path = string, - size = optional(number), - iops = optional(number), - nfs_share = optional(string) - })) - description = "Provide the path for the vpc file share to be mounted on to the HPC Cluster nodes" -} - -variable "file_share" { - type = list(string) - description = "VPC file share mount points considering the ip address and the file share name" + default = null + description = "The existing KMS instance guid." } -variable "login_private_ips" { - description = "Login private IPs" - type = string -} +############################################################################## +# TODO: Auth Server (LDAP/AD) Variables +############################################################################## -variable "login_node_instance_type" { - type = string - default = "bx2-2x8" - description = "Specify the virtual server instance profile type to be used to create the login node for the IBM Cloud HPC cluster. 
For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." - validation { - condition = can(regex("^[^\\s]+-[0-9]+x[0-9]+", var.login_node_instance_type)) - error_message = "The profile must be a valid profile name." - } -} +# variable "compute_public_key_content" { +# type = string +# sensitive = true +# default = null +# description = "Compute security key content." +# } -variable "bastion_subnets" { - type = list(object({ - name = string - id = string - zone = string - cidr = string - })) - default = [] - description = "Subnets to launch the bastion host." -} +# variable "compute_private_key_content" { +# type = string +# sensitive = true +# default = null +# description = "Compute security key content." +# } -variable "bastion_ssh_keys" { - type = list(string) - description = "The key pair to use to access the host." +variable "enable_deployer" { + type = bool + default = true + description = "Deployer should be only used for better deployment performance" } +############################################################################# +# LDAP variables +############################################################################## variable "enable_ldap" { type = bool default = false description = "Set this option to true to enable LDAP for IBM Cloud HPC, with the default value set to false." } -variable "ldap_basedns" { - type = string - default = "hpcaas.com" - description = "The dns domain name is used for configuring the LDAP server. If an LDAP server is already in existence, ensure to provide the associated DNS domain name." -} - variable "ldap_server" { type = string - default = "null" + default = null description = "Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created." } -variable "ldap_server_cert" { - type = string - sensitive = true - default = "null" - description = "Provide the existing LDAP server certificate. If not provided, the value should be set to 'null'." -} - -variable "ldap_admin_password" { - type = string - sensitive = true - default = "" - description = "The LDAP administrative password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@_+:) are required. It is important to avoid including the username in the password for enhanced security.[This value is ignored for an existing LDAP server]." -} - -variable "ldap_user_name" { - type = string - default = "" - description = "Custom LDAP User for performing cluster operations. Note: Username should be between 4 to 32 characters, (any combination of lowercase and uppercase letters).[This value is ignored for an existing LDAP server]" -} - -variable "ldap_user_password" { - type = string - sensitive = true - default = "" - description = "The LDAP user password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@_+:) are required.It is important to avoid including the username in the password for enhanced security.[This value is ignored for an existing LDAP server]." -} - -variable "ldap_vsi_profile" { - type = string - default = "cx2-2x4" - description = "Profile to be used for LDAP virtual server instance." 
-} - -variable "ldap_vsi_osimage_name" { - type = string - default = "ibm-ubuntu-22-04-4-minimal-amd64-3" - description = "Image name to be used for provisioning the LDAP instances." -} - -variable "ldap_primary_ip" { +variable "ldap_instance_key_pair" { type = list(string) - description = "List of LDAP primary IPs." -} - -############################################################################## -# High Availability -############################################################################## -variable "app_center_high_availability" { - type = bool - default = true - description = "Set to false to disable the IBM Spectrum LSF Application Center GUI High Availability (default: true) ." -} - -########################################################################### -# IBM Cloud Dababase for MySQL Instance variables -########################################################################### -variable "db_instance_info" { - description = "The IBM Cloud Database for MySQL information required to reference the PAC database." - type = object({ - id = string - admin_user = string - hostname = string - port = number - certificate = string - }) - default = null -} - -variable "db_admin_password" { - type = string default = null - sensitive = true - description = "The IBM Cloud Database for MySQL password required to reference the PAC database." -} - -variable "storage_security_group_id" { - type = string - default = null - description = "Existing Scale storage security group id" + description = "Name of the SSH key configured in your IBM Cloud account that is used to establish a connection to the LDAP Server. Make sure that the SSH key is present in the same resource group and region where the LDAP Servers are provisioned. If you do not have an SSH key in your IBM Cloud account, create one by using the [SSH keys](https://cloud.ibm.com/docs/vpc?topic=vpc-ssh-keys) instructions." +} + +variable "ldap_instances" { + type = list( + object({ + profile = string + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + image = "ibm-ubuntu-22-04-5-minimal-amd64-1" + }] + description = "Profile and Image name to be used for provisioning the LDAP instances. Note: Debian based OS are only supported for the LDAP feature" +} + +variable "afm_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-32x128" + count = 1 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Number of instances to be launched for afm hosts." } ############################################################################## -# Observability Variables +# GKLM variables ############################################################################## - -variable "observability_monitoring_enable" { - description = "Set true to enable IBM Cloud Monitoring instance provisioning." +variable "scale_encryption_enabled" { type = bool default = false + description = "To enable the encryption for the filesystem. Select true or false" } -variable "observability_monitoring_on_compute_nodes_enable" { - description = "Set true to enable IBM Cloud Monitoring on Compute Nodes." 
- type = bool - default = false -} - -variable "cloud_monitoring_access_key" { - description = "IBM Cloud Monitoring access key for agents to use" - type = string - sensitive = true -} - -variable "cloud_monitoring_ingestion_url" { - description = "IBM Cloud Monitoring ingestion url for agents to use" - type = string -} - -variable "cloud_monitoring_prws_key" { - description = "IBM Cloud Monitoring Prometheus Remote Write ingestion key" - type = string - sensitive = true -} - -variable "cloud_monitoring_prws_url" { - description = "IBM Cloud Monitoring Prometheus Remote Write ingestion url" +variable "scale_encryption_type" { type = string + default = null + description = "To enable filesystem encryption, specify either 'key_protect' or 'gklm'. If neither is specified, the default value will be 'null' and encryption is disabled" } -########################################################################### -# Existing Bastion Support variables -########################################################################### - -variable "bastion_instance_name" { - type = string +variable "gklm_instance_key_pair" { + type = list(string) default = null - description = "Bastion instance name." + description = "The key pair to use to launch the GKLM host." } -############################################################################## -# Code Engine Variables -############################################################################## - -variable "ce_project_guid" { - description = "The GUID of the Code Engine Project associated to this cluster Reservation" - type = string +variable "gklm_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Number of instances to be launched for client." } -variable "existing_kms_instance_guid" { +variable "vpc_region" { type = string default = null - description = "GUID of boot volume encryption key" + description = "vpc region" } -variable "cloud_logs_ingress_private_endpoint" { - description = "String describing resource groups to create or reference" +variable "scheduler" { type = string default = null + description = "Select one of the scheduler (LSF/Symphony/Slurm/null)" } -variable "observability_logs_enable_for_management" { - description = "Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Management Nodes will be ingested." - type = bool - default = false -} - -variable "observability_logs_enable_for_compute" { - description = "Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Compute Nodes will be ingested." - type = bool - default = false -} - -variable "solution" { +variable "ibm_customer_number" { type = string - default = "lsf" - description = "Provide the value for the solution that is needed for the support of lsf and HPC" + sensitive = true + default = null + description = "Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn)." } -variable "worker_node_max_count" { - type = number - default = 10 - description = "The maximum number of worker nodes that can be deployed in the Spectrum LSF cluster. 
In order to use the [Resource Connector](https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=lsf-resource-connnector) feature to dynamically create and delete worker nodes based on workload demand, the value selected for this parameter must be larger than worker_node_min_count. If you plan to deploy only static worker nodes in the LSF cluster, e.g., when using Spectrum Scale storage, the value for this parameter should be equal to worker_node_min_count. Enter a value in the range 1 - 500." - validation { - condition = 1 <= var.worker_node_max_count && var.worker_node_max_count <= 500 - error_message = "Input \"worker_node_max_count must\" be >= 1 and <= 500." - } -} ############################################################################## -# Dedicated Host +# Dedicatedhost Variables ############################################################################## variable "enable_dedicated_host" { type = bool default = false - description = "Set this option to true to enable dedicated hosts for the VSI created for workload servers, with the default value set to false." + description = "Enables dedicated host to the compute instances" } -variable "dedicated_host_id" { - type = string - description = "Dedicated Host for the worker nodes" - default = null +############################################################################## +# Login Variables +############################################################################## +variable "login_instance" { + type = list( + object({ + profile = string + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + image = "hpcaas-lsf10-rhel810-compute-v8" + }] + description = "Number of instances to be launched for login node." } -variable "worker_node_instance_type" { +variable "bastion_subnets" { type = list(object({ - count = number - instance_type = string + name = string + id = string + zone = string + cidr = string })) - description = "The minimum number of worker nodes refers to the static worker nodes provisioned during cluster creation. The solution supports various instance types, so specify the node count based on the requirements of each instance profile. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." - default = [ - { - count = 3 - instance_type = "bx2-4x16" - }, - { - count = 0 - instance_type = "cx2-8x16" - } - ] + default = [] + description = "Subnets to launch the bastion host." 
} diff --git a/modules/landing_zone_vsi/version.tf b/modules/landing_zone_vsi/version.tf index 6127d54e..85708b52 100644 --- a/modules/landing_zone_vsi/version.tf +++ b/modules/landing_zone_vsi/version.tf @@ -1,9 +1,22 @@ +############################################################################## +# Terraform Providers +############################################################################## + terraform { required_version = ">= 1.9.0" + # Use "greater than or equal to" range for root level modules required_providers { ibm = { source = "IBM-Cloud/ibm" - version = ">= 1.56.2" + version = ">= 1.68.1, < 2.0.0" + } + local = { + source = "hashicorp/local" + version = "~> 2" + } + null = { + source = "hashicorp/null" + version = ">= 3.0.0" } template = { source = "hashicorp/template" diff --git a/modules/ldap_remote_exec/main.tf b/modules/ldap_remote_exec/main.tf new file mode 100644 index 00000000..205fbf19 --- /dev/null +++ b/modules/ldap_remote_exec/main.tf @@ -0,0 +1,28 @@ +# Load and render the LDAP connection validation script template +data "template_file" "ldap_connection_script" { + template = file("${path.module}/validate_ldap_connection.tpl") + + vars = { + ldap_server = var.ldap_server + } +} + +# Resource to validate the LDAP server connection using remote-exec +resource "null_resource" "validate_ldap_server_connection" { + connection { + type = "ssh" + host = var.deployer_ip + user = "vpcuser" + private_key = var.bastion_private_key_content + bastion_host = var.bastion_fip + bastion_user = "ubuntu" + bastion_private_key = var.bastion_private_key_content + timeout = "60m" + } + + provisioner "remote-exec" { + inline = [ + data.template_file.ldap_connection_script.rendered + ] + } +} diff --git a/modules/ldap_remote_exec/outputs.tf b/modules/ldap_remote_exec/outputs.tf new file mode 100644 index 00000000..e69de29b diff --git a/modules/ldap_remote_exec/validate_ldap_connection.tpl b/modules/ldap_remote_exec/validate_ldap_connection.tpl new file mode 100644 index 00000000..af072ae8 --- /dev/null +++ b/modules/ldap_remote_exec/validate_ldap_connection.tpl @@ -0,0 +1,17 @@ +#!/bin/bash + +# Validate connectivity to the specified LDAP server over port 389 +# Uses OpenSSL to attempt a TCP connection and check for success + +# shellcheck disable=SC2154 +# Suppress warning for undefined variable (handled externally by templating engine) + +echo "Attempting to connect to LDAP server at ${ldap_server}:389..." + +if openssl s_client -connect "${ldap_server}:389" </dev/null 2>/dev/null | grep -q 'CONNECTED'; then + echo "✅ Successfully connected to LDAP server at ${ldap_server}:389." +else + echo "❌ Failed to connect to LDAP server at ${ldap_server}:389." + echo "Please ensure the server is reachable and listening on the expected port." + exit 1 +fi diff --git a/modules/null/ldap_remote_exec/variables.tf b/modules/ldap_remote_exec/variables.tf similarity index 56% rename from modules/null/ldap_remote_exec/variables.tf rename to modules/ldap_remote_exec/variables.tf index e3863bde..0d79e0fc 100644 --- a/modules/null/ldap_remote_exec/variables.tf +++ b/modules/ldap_remote_exec/variables.tf @@ -1,24 +1,20 @@ -variable "enable_ldap" { - type = bool - description = "Set this option to true to enable LDAP for IBM Cloud HPC, with the default value set to false." -} - variable "ldap_server" { type = string description = "Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created." 
} -variable "login_host" { +variable "deployer_ip" { description = "Login host to be used for ssh connectivity." type = string } -variable "login_user" { - description = "Login user to be used for ssh connectivity." +variable "bastion_private_key_content" { + description = "Login private key to be used for ssh connectivity." type = string } -variable "login_private_key" { - description = "Login private key to be used for ssh connectivity." +variable "bastion_fip" { type = string + default = null + description = "deployer node ip" } diff --git a/modules/null/ldap_remote_exec/version.tf b/modules/ldap_remote_exec/version.tf similarity index 100% rename from modules/null/ldap_remote_exec/version.tf rename to modules/ldap_remote_exec/version.tf diff --git a/modules/my_ip/README.md b/modules/my_ip/README.md deleted file mode 100644 index dd08134b..00000000 --- a/modules/my_ip/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# This module will detect your current internet visible address. -# It can be useful if you want your IP to be added to the allowed_ips list, -# so you are sure you will be able to access the cluster. diff --git a/modules/my_ip/datasource.tf b/modules/my_ip/datasource.tf deleted file mode 100644 index 4e77519e..00000000 --- a/modules/my_ip/datasource.tf +++ /dev/null @@ -1,3 +0,0 @@ -data "external" "my_ipv4" { - program = ["bash", "-c", "echo '{\"ip\":\"'\"$(curl -4 http://ifconfig.io)\"'\"}'"] -} diff --git a/modules/my_ip/outputs.tf b/modules/my_ip/outputs.tf deleted file mode 100644 index c95e17f2..00000000 --- a/modules/my_ip/outputs.tf +++ /dev/null @@ -1,4 +0,0 @@ -output "my_cidr" { - value = data.external.my_ipv4.result.ip != "" ? ["${data.external.my_ipv4.result.ip}/32"] : [] - description = "The IPv4 in CIDR format (a '/32' is appended)" -} diff --git a/modules/my_ip/version.tf b/modules/my_ip/version.tf deleted file mode 100644 index b62eadb6..00000000 --- a/modules/my_ip/version.tf +++ /dev/null @@ -1,9 +0,0 @@ -terraform { - required_version = ">= 1.9.0" - required_providers { - external = { - source = "hashicorp/external" - version = "2.3.3" - } - } -} diff --git a/modules/null/ldap_remote_exec/main.tf b/modules/null/ldap_remote_exec/main.tf deleted file mode 100644 index e4b34c06..00000000 --- a/modules/null/ldap_remote_exec/main.tf +++ /dev/null @@ -1,20 +0,0 @@ -data "template_file" "ldap_connection_script" { - template = file("${path.module}/validate_ldap_connection.tpl") - vars = { - ldap_server = var.ldap_server - } -} - -# The resource is used to validate the existing LDAP server connection. -resource "null_resource" "validate_ldap_server_connection" { - count = var.enable_ldap == true && var.ldap_server != "null" ? 1 : 0 - connection { - type = "ssh" - user = var.login_user - private_key = var.login_private_key - host = var.login_host - } - provisioner "remote-exec" { - inline = [data.template_file.ldap_connection_script.rendered] - } -} diff --git a/modules/null/ldap_remote_exec/validate_ldap_connection.tpl b/modules/null/ldap_remote_exec/validate_ldap_connection.tpl deleted file mode 100644 index c765ac8b..00000000 --- a/modules/null/ldap_remote_exec/validate_ldap_connection.tpl +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2154 - -if openssl s_client -connect "${ldap_server}:389" /dev/null | grep -q 'CONNECTED'; then - echo "The connection to the existing LDAP server ${ldap_server} was successfully established." -else - echo "The connection to the existing LDAP server ${ldap_server} failed, please establish it." 
- exit 1 -fi diff --git a/modules/null/local_exec/main.tf b/modules/null/local_exec/main.tf deleted file mode 100644 index b9351a60..00000000 --- a/modules/null/local_exec/main.tf +++ /dev/null @@ -1,18 +0,0 @@ -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. -# Licensed under the Apache License v2.0 -################################################### - -/* - This module used to run null for IBM Cloud CLIs -*/ - -resource "null_resource" "local_exec" { - provisioner "local-exec" { - command = "ibmcloud login --apikey ${var.ibmcloud_api_key} -r ${var.region}; ${var.command}" - } - - triggers = { - value = var.trigger_resource_id - } -} diff --git a/modules/null/local_exec/variables.tf b/modules/null/local_exec/variables.tf deleted file mode 100644 index 1cf7a665..00000000 --- a/modules/null/local_exec/variables.tf +++ /dev/null @@ -1,20 +0,0 @@ -variable "region" { - description = "The region with which IBM CLI login should happen." - type = string -} - -variable "ibmcloud_api_key" { - description = "IBM Cloud API key for the IBM Cloud account where the IBM Cloud HPC cluster needs to be deployed. For more information on how to create an API key, see [Managing user API keys](https://cloud.ibm.com/docs/account?topic=account-userapikey)." - type = string - sensitive = true -} - -variable "command" { - description = "This is the command to execute." - type = string -} - -variable "trigger_resource_id" { - description = "A map of arbitrary strings that, when changed, will force the null resource to be replaced, re-running any associated provisioners." - type = any -} diff --git a/modules/null/local_exec/version.tf b/modules/null/local_exec/version.tf deleted file mode 100644 index 6f7592be..00000000 --- a/modules/null/local_exec/version.tf +++ /dev/null @@ -1,9 +0,0 @@ -terraform { - required_version = ">= 1.9.0" - required_providers { - null = { - source = "hashicorp/null" - version = ">= 3.0.0" - } - } -} diff --git a/modules/null/local_exec_script/main.tf b/modules/null/local_exec_script/main.tf deleted file mode 100644 index 40d21752..00000000 --- a/modules/null/local_exec_script/main.tf +++ /dev/null @@ -1,6 +0,0 @@ -resource "null_resource" "execute_local_script" { - provisioner "local-exec" { - command = "${var.script_path} ${var.script_arguments}" - environment = var.script_environment - } -} diff --git a/modules/null/local_exec_script/outputs.tf b/modules/null/local_exec_script/outputs.tf deleted file mode 100644 index bdb2b8cd..00000000 --- a/modules/null/local_exec_script/outputs.tf +++ /dev/null @@ -1 +0,0 @@ -# This empty file exists to suppress TFLint Warning on the terraform_standard_module_structure diff --git a/modules/null/local_exec_script/variables.tf b/modules/null/local_exec_script/variables.tf deleted file mode 100644 index 582ffe48..00000000 --- a/modules/null/local_exec_script/variables.tf +++ /dev/null @@ -1,14 +0,0 @@ -variable "script_path" { - description = "The path to the script to execute" - type = string -} - -variable "script_arguments" { - description = "The arguments to pass to the script" - type = string -} - -variable "script_environment" { - description = "The environment variables to pass to the script" - type = map(string) -} diff --git a/modules/null/local_exec_script/version.tf b/modules/null/local_exec_script/version.tf deleted file mode 100644 index 6f7592be..00000000 --- a/modules/null/local_exec_script/version.tf +++ /dev/null @@ -1,9 +0,0 @@ -terraform { - required_version = ">= 1.9.0" - 
required_providers { - null = { - source = "hashicorp/null" - version = ">= 3.0.0" - } - } -} diff --git a/modules/null/remote_exec/main.tf b/modules/null/remote_exec/main.tf deleted file mode 100644 index 02147d34..00000000 --- a/modules/null/remote_exec/main.tf +++ /dev/null @@ -1,30 +0,0 @@ -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. -# Licensed under the Apache License v2.0 -################################################### - -/* - This module used to run null for LSF utilities -*/ - -resource "null_resource" "remote_exec" { - count = length(var.cluster_host) - connection { - type = "ssh" - host = var.cluster_host[count.index] - user = var.cluster_user - private_key = var.cluster_private_key - bastion_host = var.login_host - bastion_user = var.login_user - bastion_private_key = var.login_private_key - timeout = var.timeout - } - - provisioner "remote-exec" { - inline = var.command - } - - triggers = { - build = timestamp() - } -} diff --git a/modules/null/remote_exec/variables.tf b/modules/null/remote_exec/variables.tf deleted file mode 100644 index aa5178e0..00000000 --- a/modules/null/remote_exec/variables.tf +++ /dev/null @@ -1,40 +0,0 @@ -variable "cluster_host" { - description = "Cluster hosts to be used for ssh connectivity." - type = list(string) -} - -variable "cluster_user" { - description = "Cluster user to be used for ssh connectivity." - type = string -} - -variable "cluster_private_key" { - description = "Cluster private key to be used for ssh connectivity." - type = string -} - -variable "login_host" { - description = "Login host to be used for ssh connectivity." - type = string -} - -variable "login_user" { - description = "Login user to be used for ssh connectivity." - type = string -} - -variable "login_private_key" { - description = "Login private key to be used for ssh connectivity." - type = string -} - -variable "command" { - description = "These are the list of commands to execute." - type = list(string) -} - -variable "timeout" { - description = "Timeout for connection attempts." - type = string - default = "5m" -} diff --git a/modules/null/remote_exec/version.tf b/modules/null/remote_exec/version.tf deleted file mode 100644 index 6f7592be..00000000 --- a/modules/null/remote_exec/version.tf +++ /dev/null @@ -1,9 +0,0 @@ -terraform { - required_version = ">= 1.9.0" - required_providers { - null = { - source = "hashicorp/null" - version = ">= 3.0.0" - } - } -} diff --git a/modules/null/remote_exec_script/locals.tf b/modules/null/remote_exec_script/locals.tf deleted file mode 100644 index a1ba863f..00000000 --- a/modules/null/remote_exec_script/locals.tf +++ /dev/null @@ -1,4 +0,0 @@ -locals { - command_1 = "sh -c \"cd /tmp && ${var.with_bash ? "bash " : ""}${var.script_to_run}\"" - final_command = "${var.sudo_user != "" ? "sudo -i -u ${var.sudo_user} -- " : ""}${local.command_1}" -} diff --git a/modules/null/remote_exec_script/main.tf b/modules/null/remote_exec_script/main.tf deleted file mode 100644 index 52a3019e..00000000 --- a/modules/null/remote_exec_script/main.tf +++ /dev/null @@ -1,126 +0,0 @@ -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. 
-# Licensed under the Apache License v2.0 -################################################### - -/* - This module used to run null for LSF utilities -*/ - -resource "null_resource" "remote_exec_script_cp_files" { - count = length(var.cluster_host) * length(var.payload_files) - - provisioner "file" { - connection { - type = "ssh" - host = var.cluster_host[floor(count.index / length(var.payload_files))] - user = var.cluster_user - private_key = var.cluster_private_key - bastion_host = var.login_host - bastion_user = var.login_user - bastion_private_key = var.login_private_key - } - source = var.payload_files[count.index % length(var.payload_files)] - destination = "/tmp/${basename(var.payload_files[count.index % length(var.payload_files)])}" - } - - provisioner "local-exec" { - when = destroy - command = "true" - } - - triggers = { - trigger_string = var.trigger_string - } -} - -resource "null_resource" "remote_exec_script_cp_dirs" { - count = length(var.cluster_host) * length(var.payload_dirs) - - provisioner "file" { - connection { - type = "ssh" - host = var.cluster_host[floor(count.index / length(var.payload_dirs))] - user = var.cluster_user - private_key = var.cluster_private_key - bastion_host = var.login_host - bastion_user = var.login_user - bastion_private_key = var.login_private_key - } - source = var.payload_dirs[count.index % length(var.payload_dirs)] - destination = "/tmp/" - } - - provisioner "local-exec" { - when = destroy - command = "true" - } - - triggers = { - trigger_string = var.trigger_string - } -} - -resource "null_resource" "remote_exec_script_new_file" { - count = var.new_file_name != "" ? length(var.cluster_host) : 0 - - provisioner "file" { - connection { - type = "ssh" - host = var.cluster_host[count.index] - user = var.cluster_user - private_key = var.cluster_private_key - bastion_host = var.login_host - bastion_user = var.login_user - bastion_private_key = var.login_private_key - } - content = var.new_file_content - destination = "/tmp/${var.new_file_name}" - } - - provisioner "local-exec" { - when = destroy - command = "true" - } - - depends_on = [ - null_resource.remote_exec_script_cp_dirs # we may want to create the file in a subpath created with the cp_dirs - ] - triggers = { - trigger_string = var.trigger_string - } -} - -resource "null_resource" "remote_exec_script_run" { - count = length(var.cluster_host) - - provisioner "remote-exec" { - connection { - type = "ssh" - host = var.cluster_host[count.index] - user = var.cluster_user - private_key = var.cluster_private_key - bastion_host = var.login_host - bastion_user = var.login_user - bastion_private_key = var.login_private_key - } - inline = [ - "sh -c \"chmod +x /tmp/${var.script_to_run}\"", - "cd /tmp && ${local.final_command}" - ] - } - - provisioner "local-exec" { - when = destroy - command = "true" - } - - depends_on = [ - null_resource.remote_exec_script_cp_files, - null_resource.remote_exec_script_cp_dirs, - null_resource.remote_exec_script_new_file - ] - triggers = { - trigger_string = var.trigger_string - } -} diff --git a/modules/null/remote_exec_script/outputs.tf b/modules/null/remote_exec_script/outputs.tf deleted file mode 100644 index 21f87cdd..00000000 --- a/modules/null/remote_exec_script/outputs.tf +++ /dev/null @@ -1,9 +0,0 @@ -output "command_1" { - description = "Command for reference" - value = local.command_1 -} - -output "final_command" { - description = "Command for reference" - value = local.final_command -} diff --git a/modules/null/remote_exec_script/variables.tf 
b/modules/null/remote_exec_script/variables.tf deleted file mode 100644 index 79d0e9e5..00000000 --- a/modules/null/remote_exec_script/variables.tf +++ /dev/null @@ -1,76 +0,0 @@ -variable "cluster_host" { - description = "Cluster hosts to be used for ssh connectivity." - type = list(string) -} - -variable "cluster_user" { - description = "Cluster user to be used for ssh connectivity." - type = string -} - -variable "cluster_private_key" { - description = "Cluster private key to be used for ssh connectivity." - type = string -} - -variable "login_host" { - description = "Login host to be used for ssh connectivity." - type = string -} - -variable "login_user" { - description = "Login user to be used for ssh connectivity." - type = string -} - -variable "login_private_key" { - description = "Login private key to be used for ssh connectivity." - type = string -} - -variable "payload_files" { - description = "List of files that are to be transferred." - type = list(string) - default = [] -} - -variable "payload_dirs" { - description = "List of directories that are to be transferred." - type = list(string) - default = [] -} - -variable "new_file_name" { - description = "File name to be created." - type = string - default = "" -} - -variable "new_file_content" { - description = "Content of file to be created." - type = string - default = "" -} - -variable "script_to_run" { - description = "Name of script to be run." - type = string -} - -variable "sudo_user" { - description = "User we want to sudo to (e.g. 'root')." - type = string - default = "" -} - -variable "with_bash" { - description = "If we want a 'bash -c' execution of the script." - type = bool - default = false -} - -variable "trigger_string" { - description = "Changing this string will trigger a re-run" - type = string - default = "" -} diff --git a/modules/null/remote_exec_script/version.tf b/modules/null/remote_exec_script/version.tf deleted file mode 100644 index 6f7592be..00000000 --- a/modules/null/remote_exec_script/version.tf +++ /dev/null @@ -1,9 +0,0 @@ -terraform { - required_version = ">= 1.9.0" - required_providers { - null = { - source = "hashicorp/null" - version = ">= 3.0.0" - } - } -} diff --git a/modules/observability_instance/datasources.tf b/modules/observability_instance/datasources.tf index f431ff4e..6be8d171 100644 --- a/modules/observability_instance/datasources.tf +++ b/modules/observability_instance/datasources.tf @@ -7,7 +7,7 @@ data "http" "sysdig_prws_key" { # Optional request headers request_headers = { Accept = "application/json" - Authorization = sensitive(data.ibm_iam_auth_token.tokendata.iam_access_token) # <--- Wrap this + Authorization = data.ibm_iam_auth_token.tokendata.iam_access_token IBMInstanceID = var.cloud_monitoring_provision ? 
module.observability_instance.cloud_monitoring_guid : "" } } diff --git a/modules/observability_instance/main.tf b/modules/observability_instance/main.tf index 43568f7e..49477413 100644 --- a/modules/observability_instance/main.tf +++ b/modules/observability_instance/main.tf @@ -1,7 +1,7 @@ # This module requires additional logdna provider configuration blocks locals { cloud_monitoring_instance_name = var.cloud_monitoring_instance_name - logs_instance_endpoint = "https://api.${var.location}.logging.cloud.ibm.com" + # logs_instance_endpoint = "https://api.${var.location}.logging.cloud.ibm.com" } module "observability_instance" { @@ -21,13 +21,13 @@ module "observability_instance" { # logs and metrics buckets must be different logs_data = { enabled = true - bucket_crn = var.cloud_logs_data_bucket != null ? var.cloud_logs_data_bucket["crn"] : "" - bucket_endpoint = var.cloud_logs_data_bucket != null ? var.cloud_logs_data_bucket["s3_endpoint_direct"] : "" + bucket_crn = var.cloud_logs_data_bucket != null ? var.cloud_logs_data_bucket["bucket_crn"] : "" + bucket_endpoint = var.cloud_logs_data_bucket != null ? var.cloud_logs_data_bucket["bucket_endpoint"] : "" }, metrics_data = { enabled = true - bucket_crn = var.cloud_metrics_data_bucket != null ? var.cloud_metrics_data_bucket["crn"] : "" - bucket_endpoint = var.cloud_metrics_data_bucket != null ? var.cloud_metrics_data_bucket["s3_endpoint_direct"] : "" + bucket_crn = var.cloud_metrics_data_bucket != null ? var.cloud_metrics_data_bucket["bucket_crn"] : "" + bucket_endpoint = var.cloud_metrics_data_bucket != null ? var.cloud_metrics_data_bucket["bucket_endpoint"] : "" } } activity_tracker_routes = var.cloud_logs_as_atracker_target ? [ diff --git a/modules/observability_instance/outputs.tf b/modules/observability_instance/outputs.tf index 523a668e..29c45608 100644 --- a/modules/observability_instance/outputs.tf +++ b/modules/observability_instance/outputs.tf @@ -35,6 +35,11 @@ output "cloud_monitoring_url" { description = "IBM Cloud Monitoring URL" } +output "cloud_monitoring_crn" { + value = var.cloud_monitoring_provision ? module.observability_instance.cloud_monitoring_crn : null + description = "IBM Cloud Monitoring CRN" +} + output "cloud_logs_url" { value = var.cloud_logs_provision ? "https://dashboard.${var.location}.logs.cloud.ibm.com/${module.observability_instance.cloud_logs_guid}" : null description = "IBM Cloud Logs URL" diff --git a/modules/observability_instance/providers.tf b/modules/observability_instance/providers.tf deleted file mode 100644 index 550e085d..00000000 --- a/modules/observability_instance/providers.tf +++ /dev/null @@ -1,23 +0,0 @@ -provider "logdna" { - alias = "ats" - servicekey = module.observability_instance.activity_tracker_ats_resource_key != null ? module.observability_instance.activity_tracker_ats_resource_key : "" - url = local.logs_instance_endpoint -} - -provider "logdna" { - alias = "sts" - servicekey = module.observability_instance.log_analysis_sts_resource_key != null ? module.observability_instance.log_analysis_sts_resource_key : "" - url = local.logs_instance_endpoint -} - -provider "logdna" { - alias = "at" - servicekey = module.observability_instance.activity_tracker_resource_key != null ? module.observability_instance.activity_tracker_resource_key : "" - url = local.logs_instance_endpoint -} - -provider "logdna" { - alias = "ld" - servicekey = module.observability_instance.log_analysis_resource_key != null ? 
module.observability_instance.log_analysis_resource_key : "" - url = local.logs_instance_endpoint -} diff --git a/modules/observability_instance/versions.tf b/modules/observability_instance/versions.tf index 882585d2..5cf4da6c 100644 --- a/modules/observability_instance/versions.tf +++ b/modules/observability_instance/versions.tf @@ -5,11 +5,6 @@ terraform { source = "IBM-Cloud/ibm" version = ">= 1.56.2" } - logdna = { - source = "logdna/logdna" - version = ">= 1.14.2" - configuration_aliases = [logdna.ats, logdna.sts, logdna.at, logdna.ld] - } http = { source = "hashicorp/http" version = ">= 2.0.0" diff --git a/modules/playbook/main.tf b/modules/playbook/main.tf new file mode 100644 index 00000000..00f34915 --- /dev/null +++ b/modules/playbook/main.tf @@ -0,0 +1,640 @@ +locals { + proxyjump = var.enable_deployer ? "-o ProxyJump=ubuntu@${var.bastion_fip}" : "" + common_config_playbook = format("%s/common_config_playbook.yml", var.playbooks_path) + pre_lsf_config_playbook = format("%s/pre_lsf_config_playbook.yml", var.playbooks_path) + login_node_playbook = format("%s/login_node_configuration.yml", var.playbooks_path) + lsf_post_config_playbook = format("%s/lsf_post_config_playbook.yml", var.playbooks_path) + ldap_server_inventory = format("%s/ldap_server_inventory.ini", var.playbooks_path) + configure_ldap_client = format("%s/configure_ldap_client.yml", var.playbooks_path) + prepare_ldap_server = format("%s/prepare_ldap_server.yml", var.playbooks_path) + deployer_hostentry_playbook_path = format("%s/deployer_host_entry_play.yml", var.playbooks_path) + lsf_hostentry_playbook_path = format("%s/lsf_host_entry_play.yml", var.playbooks_path) + remove_hostentry_playbooks_path = format("%s/remove_host_entry_play.yml", var.playbooks_path) + lsf_prerequesite_playbook_path = format("%s/lsf_prerequesite_play.yml", var.playbooks_path) + deployer_host = jsonencode(var.deployer_host) + mgmnt_hosts = jsonencode(var.mgmnt_hosts) + comp_hosts = jsonencode(var.comp_hosts) + login_host = jsonencode(var.login_host) + # domain_name = var.domain_name +} + +resource "local_file" "deployer_host_entry_play" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 
1 : 0 + content = <- + {{ {} | combine(mgmnt_hosts | from_json, comp_hosts | from_json, login_host | from_json) }} + + - name: Invert mapping to ensure 1 hostname = 1 IP (latest IP kept) + ansible.builtin.set_fact: + hostname_map: >- + {{ + all_hosts + | dict2items + | reverse + | items2dict(key_name='value', value_name='key') + }} + + - name: Generate managed block content + ansible.builtin.set_fact: + managed_block: | + {% for hostname, ip in hostname_map.items() -%} + {{ ip }} {{ hostname }} {{ hostname }}.{{ domain_name }} + {% endfor %} + + - name: Update /etc/hosts with managed entries + ansible.builtin.blockinfile: + path: "{{ hosts_file }}" + marker: "# === ANSIBLE MANAGED HOSTS {mark} ===" + block: "{{ managed_block }}" + + - name: Insert Create folder and Ensure js.conf lines + ansible.builtin.blockinfile: + path: /opt/ibm/lsf_installer/playbook/roles/deploy-gui/tasks/configure_pm_common.yml + marker: "# {mark} MANAGED BLOCK FOR PM_CONF_DIR" + insertbefore: "^\\- name: Update JS_HOST" + block: | + {% raw %} + - name: Create folder from PM_CONF_DIR variable + file: + path: "{{ PM_CONF_DIR }}" + state: directory + mode: '0755' + - name: Ensure js.conf file exists + file: + path: "{{ PM_CONF_DIR }}/js.conf" + state: touch + mode: '0644' + {% endraw %} +EOT + filename = local.deployer_hostentry_playbook_path +} + +resource "null_resource" "deploy_host_playbook" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -f 200 -e 'mgmnt_hosts=${local.mgmnt_hosts}' -e 'comp_hosts=${local.comp_hosts}' -e 'login_host=${local.login_host}' -e 'domain_name=${var.domain_name}' '${local.deployer_hostentry_playbook_path}'" + } + + triggers = { + build = timestamp() + } + depends_on = [local_file.deployer_host_entry_play] +} + +resource "local_file" "lsf_host_entry_playbook" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 
1 : 0 + content = < + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + tasks: + + - name: Wait for SSH (retry 5x) + ansible.builtin.wait_for: + port: 22 + host: [all_nodes] + timeout: 60 + delay: 10 + retries: 5 + until: true + delegate_to: localhost + ignore_errors: yes + + - name: Check passwordless SSH on all scale inventory hosts + shell: echo PASSWDLESS_SSH_ENABLED + register: result + until: result.stdout.find("PASSWDLESS_SSH_ENABLED") != -1 + retries: 60 + delay: 10 + +- name: Manage /etc/hosts with dynamic host-IP mappings + hosts: all + become: yes + vars: + mgmnt_hosts: '{}' + comp_hosts: '{}' + login_host: '{}' + deployer_host: '{}' + hosts_file: /etc/hosts + + pre_tasks: + - name: Load cluster-specific variables + ansible.builtin.include_vars: + file: all.json + + tasks: + - name: Parse and merge host mappings + ansible.builtin.set_fact: + all_hosts: >- + {{ {} | combine(mgmnt_hosts | from_json, comp_hosts | from_json, login_host | from_json, deployer_host | from_json) }} + + - name: Invert mapping to ensure 1 hostname = 1 IP (latest IP kept) + ansible.builtin.set_fact: + hostname_map: >- + {{ + all_hosts + | dict2items + | reverse + | items2dict(key_name='value', value_name='key') + }} + + - name: Generate managed block content + ansible.builtin.set_fact: + managed_block: | + {% for hostname, ip in hostname_map.items() -%} + {{ ip }} {{ hostname }} {{ hostname }}.{{ domain_name }} + {% endfor %} + + - name: Update /etc/hosts with managed entries + ansible.builtin.blockinfile: + path: "{{ hosts_file }}" + marker: "# === ANSIBLE MANAGED HOSTS {mark} ===" + block: "{{ managed_block }}" +EOT + filename = local.lsf_hostentry_playbook_path +} + +resource "null_resource" "lsf_host_play" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -f 50 -e 'deployer_host=${local.deployer_host}' -e 'mgmnt_hosts=${local.mgmnt_hosts}' -e 'comp_hosts=${local.comp_hosts}' -e 'login_host=${local.login_host}' -e 'domain_name=${var.domain_name}' -i ${var.inventory_path} '${local.lsf_hostentry_playbook_path}'" + } + + triggers = { + build = timestamp() + } + depends_on = [null_resource.deploy_host_playbook, local_file.lsf_host_entry_playbook] +} + +resource "local_file" "create_common_config_playbook" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + content = < + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + pre_tasks: + - name: Load cluster-specific variables + include_vars: all.json + roles: + - { role: vpc_fileshare_config } + - { role: lsf_prereq_config } +EOT + filename = local.common_config_playbook +} + +resource "null_resource" "run_common_config_playbook" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 
1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -f 200 -i ${var.inventory_path} ${local.common_config_playbook}" + } + triggers = { + build = timestamp() + } + depends_on = [local_file.create_common_config_playbook, null_resource.lsf_host_play] +} + +resource "local_file" "create_pre_lsf_config_playbook" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + content = < + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + pre_tasks: + - name: Load cluster-specific variables + include_vars: all.json + roles: + - { role: lsf_template_config } +EOT + filename = local.pre_lsf_config_playbook +} + +resource "null_resource" "run_pre_lsf_config_playbook" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -f 200 -i ${var.inventory_path} ${local.pre_lsf_config_playbook}" + } + triggers = { + build = timestamp() + } + depends_on = [local_file.create_pre_lsf_config_playbook, null_resource.run_common_config_playbook] +} + +resource "local_file" "lsf_prerequesite_playbook" { + count = var.inventory_path != null && var.scheduler == "LSF" && var.enable_dedicated_host ? 1 : 0 + content = < + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + pre_tasks: + - name: Load cluster-specific variables + include_vars: all.json + roles: + - { role: lsf_mgmt_config } +EOT + filename = var.lsf_mgmt_playbooks_path +} + + +resource "null_resource" "run_playbook_for_mgmt_config" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -f 200 -i ${var.inventory_path} ${var.lsf_mgmt_playbooks_path}" + } + triggers = { + build = timestamp() + } + depends_on = [local_file.create_playbook_for_mgmt_config, null_resource.run_lsf_playbooks] +} + +resource "local_file" "create_playbook_for_login_node_config" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + content = < + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + pre_tasks: + - name: Load cluster-specific variables + include_vars: all.json + roles: + - { role: lsf_login_config } +EOT + filename = local.login_node_playbook +} + + +resource "null_resource" "run_playbook_for_login_node_config" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -f 200 -i ${var.inventory_path} ${local.login_node_playbook}" + } + triggers = { + build = timestamp() + } + depends_on = [local_file.create_playbook_for_mgmt_config, null_resource.run_lsf_playbooks] +} + +resource "local_file" "create_playbook_for_post_deploy_config" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 
1 : 0 + content = < + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + pre_tasks: + - name: Load cluster-specific variables + include_vars: all.json + roles: + - { role: lsf_post_config } +EOT + filename = local.lsf_post_config_playbook +} + + +resource "null_resource" "run_playbook_post_deploy_config" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -f 200 -i ${var.inventory_path} ${local.lsf_post_config_playbook}" + } + triggers = { + build = timestamp() + } + depends_on = [local_file.create_playbook_for_post_deploy_config, null_resource.run_playbook_for_mgmt_config, null_resource.run_playbook_for_login_node_config] +} + +resource "local_file" "prepare_ldap_server_playbook" { + count = local.ldap_server_inventory != null && var.enable_ldap && var.ldap_server == "null" && var.scheduler == "LSF" ? 1 : 0 + content = < + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + roles: + - { role: ldap_server_prepare } +EOT + filename = local.prepare_ldap_server +} + +resource "null_resource" "configure_ldap_server_playbook" { + count = local.ldap_server_inventory != null && var.enable_ldap && var.ldap_server == "null" && var.scheduler == "LSF" ? 1 : 0 + + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -i ${local.ldap_server_inventory} ${local.prepare_ldap_server}" + } + triggers = { + build = timestamp() + } + depends_on = [local_file.prepare_ldap_server_playbook] +} + +resource "local_file" "prepare_ldap_client_playbook" { + count = var.inventory_path != null && var.enable_ldap && var.scheduler == "LSF" ? 1 : 0 + content = < + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + roles: + - { role: ldap_client_config } +EOT + filename = local.configure_ldap_client +} + +resource "null_resource" "run_ldap_client_playbooks" { + count = var.inventory_path != null && var.enable_ldap && var.scheduler == "LSF" ? 1 : 0 + + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -f 200 -i ${var.inventory_path} ${local.configure_ldap_client}" + } + triggers = { + build = timestamp() + } + depends_on = [local_file.prepare_ldap_client_playbook, null_resource.configure_ldap_server_playbook, null_resource.run_playbook_for_mgmt_config] +} + +resource "null_resource" "export_api" { + count = (var.cloudlogs_provision && var.scheduler == "LSF") || var.scheduler == "Scale" ? 
1 : 0 + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = < + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + roles: + - { role: cloudlogs, tags: ["cloud_logs"] } + +- name: Cloud Monitoring Configuration + hosts: [mgmt_compute_nodes] + any_errors_fatal: true + gather_facts: true + vars: + ansible_ssh_common_args: > + ${local.proxyjump} + -o ControlMaster=auto + -o ControlPersist=30m + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + ansible_user: root + ansible_ssh_private_key_file: ${var.private_key_path} + roles: + - { role: cloudmonitoring, tags: ["cloud_monitoring"] } +EOT + filename = var.observability_playbook_path +} + +resource "null_resource" "run_observability_playbooks" { + count = var.inventory_path != null && var.observability_provision && var.scheduler == "LSF" ? 1 : 0 + + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = "sudo ansible-playbook -f 200 -i ${var.inventory_path} ${var.observability_playbook_path}" + } + triggers = { + build = timestamp() + } + depends_on = [null_resource.export_api] +} + +resource "local_file" "remove_host_entry_playbook" { + count = var.inventory_path != null && var.scheduler == "LSF" ? 1 : 0 + content = < 0 ? var.dns_custom_resolver_id : null) : var.dns_custom_resolver_id) + dns_instance_id = jsonencode(var.dns_instance_id != null ? (length(var.dns_instance_id) > 0 ? var.dns_instance_id : null) : var.dns_instance_id) + list_ldap_instances = jsonencode(var.ldap_instance) + ldap_server = jsonencode(var.ldap_server) + ldap_basedns = jsonencode(var.ldap_basedns) + list_ldap_ssh_keys = jsonencode(var.ldap_instance_key_pair) + list_afm_instances = jsonencode(var.afm_instances) + afm_cos_config_details = jsonencode(var.afm_cos_config) + list_gklm_ssh_keys = jsonencode(var.gklm_instance_key_pair) + list_gklm_instances = jsonencode(var.gklm_instances) + scale_encryption_type = jsonencode(var.scale_encryption_type) + filesystem_config = jsonencode(var.filesystem_config) + scale_encryption_admin_password = jsonencode(var.scale_encryption_admin_password) + custom_file_shares = jsonencode(var.custom_file_shares) + resource_group_ids = jsonencode(var.resource_group_ids) + existing_bastion_instance_name = jsonencode(var.existing_bastion_instance_name == null ? null : var.existing_bastion_instance_name) + existing_bastion_security_group_id = jsonencode(var.existing_bastion_security_group_id == null ? null : var.existing_bastion_security_group_id) + login_instance = jsonencode(var.login_instance) + +} diff --git a/modules/prepare_tf_input/main.tf b/modules/prepare_tf_input/main.tf new file mode 100644 index 00000000..2f45d195 --- /dev/null +++ b/modules/prepare_tf_input/main.tf @@ -0,0 +1,93 @@ +resource "local_sensitive_file" "prepare_tf_input" { + count = var.enable_deployer == true ? 
1 : 0 + content = < instance_details.address }, {}) + description = "Instance name and ip map" + depends_on = [ibm_dns_resource_record.a_itself, ibm_dns_resource_record.ptr_itself] +} + +output "reserved_ip_id_ip_map" { + value = try({ for reserved_ip_details in ibm_is_subnet_reserved_ip.itself : reserved_ip_details.name => reserved_ip_details.reserved_ip }, {}) + description = "Reserved name and its ip map" + depends_on = [ibm_dns_resource_record.a_itself, ibm_dns_resource_record.ptr_itself] +} diff --git a/modules/protocol_reserved_ip/protocol_reserved_ip.tf b/modules/protocol_reserved_ip/protocol_reserved_ip.tf new file mode 100644 index 00000000..7a1fb425 --- /dev/null +++ b/modules/protocol_reserved_ip/protocol_reserved_ip.tf @@ -0,0 +1,68 @@ +/* + Creates IBM Cloud reserved ip for protocol nodes. +*/ +# Subnet ID with address, name and auto_delete +resource "ibm_is_subnet_reserved_ip" "itself" { + for_each = { + # This assigns a subnet-id to each of the instance + # iteration. + for idx, count_number in range(1, var.total_reserved_ips + 1) : idx => { + sequence_string = tostring(count_number) + subnet_id = element(var.subnet_id, idx) + } + } + + subnet = each.value.subnet_id + name = format("%s-%s-res", var.name, each.value.sequence_string) +} + +resource "ibm_dns_resource_record" "a_itself" { + for_each = { + for idx, count_number in range(1, var.total_reserved_ips + 1) : idx => { + name = element(tolist([for name_details in ibm_is_subnet_reserved_ip.itself : name_details.name]), idx) + network_ip = element(tolist([for ip_details in ibm_is_subnet_reserved_ip.itself : ip_details.address]), idx) + } + } + + instance_id = var.protocol_dns_service_id + zone_id = var.protocol_dns_zone_id + type = "A" + name = each.value.name + rdata = each.value.network_ip + ttl = 300 + depends_on = [ibm_is_subnet_reserved_ip.itself] +} + +resource "ibm_dns_resource_record" "b_itself" { + for_each = { + for idx, count_number in range(1, var.total_reserved_ips + 1) : idx => { + name = element(tolist([for name_details in ibm_is_subnet_reserved_ip.itself : var.name]), idx) + network_ip = element(tolist([for ip_details in ibm_is_subnet_reserved_ip.itself : ip_details.address]), idx) + } + } + + instance_id = var.protocol_dns_service_id + zone_id = var.protocol_dns_zone_id + type = "A" + name = each.value.name + rdata = each.value.network_ip + ttl = 300 + depends_on = [ibm_is_subnet_reserved_ip.itself] +} + +resource "ibm_dns_resource_record" "ptr_itself" { + for_each = { + for idx, count_number in range(1, var.total_reserved_ips + 1) : idx => { + name = element(tolist([for name_details in ibm_is_subnet_reserved_ip.itself : name_details.name]), idx) + network_ip = element(tolist([for ip_details in ibm_is_subnet_reserved_ip.itself : ip_details.address]), idx) + } + } + + instance_id = var.protocol_dns_service_id + zone_id = var.protocol_dns_zone_id + type = "PTR" + name = each.value.network_ip + rdata = format("%s.%s", each.value.name, var.protocol_domain) + ttl = 300 + depends_on = [ibm_dns_resource_record.a_itself] +} diff --git a/modules/protocol_reserved_ip/variables.tf b/modules/protocol_reserved_ip/variables.tf new file mode 100644 index 00000000..c9456e57 --- /dev/null +++ b/modules/protocol_reserved_ip/variables.tf @@ -0,0 +1,29 @@ +variable "total_reserved_ips" { + type = number + description = "Total number of reserved ips." +} + +variable "subnet_id" { + type = list(string) + description = "Protocol subnet id." +} + +variable "name" { + type = string + description = "Name of reserved ips." 
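# Illustrative sketch only: one hypothetical way the protocol_reserved_ip
# module defined above could be consumed by a parent configuration. The source
# path, subnet ID, domain, and DNS service/zone IDs are placeholders, not
# values taken from this change.
#
# module "protocol_reserved_ip" {
#   source                  = "../../modules/protocol_reserved_ip"
#   total_reserved_ips      = 2
#   subnet_id               = ["0717-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"]
#   name                    = "hpc-protocol"
#   protocol_domain         = "ces.example.com"
#   protocol_dns_service_id = "00000000-0000-0000-0000-000000000000"
#   protocol_dns_zone_id    = "example-zone-id"
# }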
+} + +variable "protocol_domain" { + type = string + description = "Protocol domain name." +} + +variable "protocol_dns_service_id" { + type = string + description = "Protocol DNS service id." +} + +variable "protocol_dns_zone_id" { + type = string + description = "Protocol DNS zone id." +} diff --git a/modules/protocol_reserved_ip/version.tf b/modules/protocol_reserved_ip/version.tf new file mode 100644 index 00000000..913bf325 --- /dev/null +++ b/modules/protocol_reserved_ip/version.tf @@ -0,0 +1,22 @@ +############################################################################## +# Terraform Providers +############################################################################## + +terraform { + required_version = ">= 1.3" + # Use "greater than or equal to" range for root level modules + required_providers { + ibm = { + source = "IBM-Cloud/ibm" + version = ">= 1.68.1, < 2.0.0" + } + # local = { + # source = "hashicorp/local" + # version = "~> 2" + # } + # ansible = { + # source = "ansible/ansible" + # version = "~> 1.3.0" + # } + } +} diff --git a/modules/resource_provisioner/locals.tf b/modules/resource_provisioner/locals.tf new file mode 100644 index 00000000..ac8e6aa9 --- /dev/null +++ b/modules/resource_provisioner/locals.tf @@ -0,0 +1,15 @@ +locals { + schematics_inputs_path = format("/tmp/.schematics/%s/solution_terraform.auto.tfvars.json", var.cluster_prefix) + remote_inputs_path = format("%s/terraform.tfvars.json", "/tmp") + deployer_path = "/opt/ibm" + remote_terraform_path = format("%s/terraform-ibm-hpc", local.deployer_path) + da_hpc_repo_url = "github.com/terraform-ibm-modules/terraform-ibm-hpc.git" + da_hpc_repo_tag = "lsf-da-20-june" ###### change it to main in future + remote_ansible_path = format("%s/ibm-spectrumscale-cloud-deploy", local.deployer_path) + scale_cloud_infra_repo_url = "https://github.com/jayeshh123/ibm-spectrum-scale-install-infra" + scale_cloud_infra_repo_name = "ibm-spectrum-scale-install-infra" + scale_cloud_infra_repo_tag = "jay_scale_da_api" + products = var.scheduler == "Scale" ? "scale" : "lsf" + ssh_key_file = "${path.root}/../../solutions/${local.products}/bastion_id_rsa" + bastion_public_key_content = var.existing_bastion_instance_name != null ? var.bastion_public_key_content : "" +} diff --git a/modules/resource_provisioner/main.tf b/modules/resource_provisioner/main.tf new file mode 100644 index 00000000..67a491cd --- /dev/null +++ b/modules/resource_provisioner/main.tf @@ -0,0 +1,132 @@ +resource "null_resource" "tf_resource_provisioner" { + count = var.enable_deployer == true ? 1 : 0 + connection { + type = "ssh" + host = var.deployer_ip + user = "vpcuser" + private_key = var.bastion_private_key_content + bastion_host = var.bastion_fip + bastion_user = "ubuntu" + bastion_private_key = var.bastion_private_key_content + timeout = "60m" + } + + provisioner "file" { + source = local.schematics_inputs_path + destination = local.remote_inputs_path + } + + provisioner "remote-exec" { + inline = [ + # Remove and re-clone the remote terraform path repo + # "if [ -d ${local.remote_terraform_path} ]; then echo 'Removing existing repository at ${local.remote_terraform_path}' && sudo rm -rf ${local.remote_terraform_path}; fi", + # "echo 'Cloning repository with tag: ${local.da_hpc_repo_tag}' && sudo git clone -b ${local.da_hpc_repo_tag} https://${var.github_token}@${local.da_hpc_repo_url} ${local.remote_terraform_path}", + "if [ !
-d ${local.remote_terraform_path} ]; then echo 'Cloning repository with tag: ${local.da_hpc_repo_tag}' && sudo git clone -b ${local.da_hpc_repo_tag} https://${local.da_hpc_repo_url} ${local.remote_terraform_path}; fi", + + # Clone Spectrum Scale collection if it doesn't exist + "if [ ! -d ${local.remote_ansible_path}/${local.scale_cloud_infra_repo_name}/collections/ansible_collections/ibm/spectrum_scale ]; then sudo git clone -b ${local.scale_cloud_infra_repo_tag} ${local.scale_cloud_infra_repo_url} ${local.remote_ansible_path}/${local.scale_cloud_infra_repo_name}/collections/ansible_collections/ibm/spectrum_scale; fi", + + # Ensure ansible-playbook is available + "sudo ln -fs /usr/local/bin/ansible-playbook /usr/bin/ansible-playbook", + + # Copy inputs file + "sudo cp ${local.remote_inputs_path} ${local.remote_terraform_path}", + + # Run Terraform init and apply + "export TF_LOG=${var.TF_LOG} && sudo -E terraform -chdir=${local.remote_terraform_path} init && sudo -E terraform -chdir=${local.remote_terraform_path} apply -parallelism=${var.TF_PARALLELISM} -auto-approve -lock=false" + ] + } + + triggers = { + always_run = timestamp() + } +} + +resource "null_resource" "ext_bastion_access" { + count = var.enable_deployer && var.existing_bastion_instance_name != null ? 1 : 0 + + connection { + type = "ssh" + host = var.bastion_fip + user = "ubuntu" + private_key = var.bastion_private_key_content + timeout = "60m" + } + + provisioner "remote-exec" { + inline = [ + "echo 'Adding SSH Key to Existing Bastion Host'", + sensitive("echo '${local.bastion_public_key_content}' >> /home/$(whoami)/.ssh/authorized_keys"), + ] + } +} + +resource "null_resource" "fetch_host_details_from_deployer" { + count = var.enable_deployer == true && var.scheduler == "LSF" ? 1 : 0 + + provisioner "local-exec" { + command = < 0 ? module.app_config[0].app_config_crn : null + scc_workload_protection_trusted_profile_name = var.cspm_enabled == true ? var.scc_workload_protection_trusted_profile_name : "workload-protection-trusted-profile" +} diff --git a/modules/security/sccwp/outputs.tf b/modules/security/sccwp/outputs.tf new file mode 100644 index 00000000..8b7b85f5 --- /dev/null +++ b/modules/security/sccwp/outputs.tf @@ -0,0 +1,4 @@ +output "app_config_crn" { + description = "app config crn" + value = length(module.app_config) > 0 ? module.app_config[0].app_config_crn : null +} diff --git a/modules/security/sccwp/variables.tf b/modules/security/sccwp/variables.tf new file mode 100644 index 00000000..9d4ed49a --- /dev/null +++ b/modules/security/sccwp/variables.tf @@ -0,0 +1,142 @@ +variable "region" { + description = "IBM Cloud region where all resources will be deployed" + type = string + default = "us-south" +} + +variable "resource_group_name" { + description = "The resource group ID where resources will be provisioned." + type = string +} + +variable "prefix" { + description = "The name to give the SCC Workload Protection instance that will be provisioned by this module." + type = string +} + +variable "sccwp_service_plan" { + description = "IBM service pricing plan." + type = string + default = "free-trial" + validation { + error_message = "Plan for SCC Workload Protection instances can only be `free-trial` or `graduated-tier`." + condition = contains( + ["free-trial", "graduated-tier"], + var.sccwp_service_plan + ) + } +} + +variable "app_config_plan" { + description = "IBM service pricing plan." 
+ type = string + default = "basic" + validation { + error_message = "Plan for App configuration can only be basic, lite, standardv2, enterprise.." + condition = contains( + ["basic", "lite", "standardv2", "enterprise"], + var.app_config_plan + ) + } +} + +variable "resource_tags" { + type = list(string) + description = "Optional list of tags to be added to created SCC WP instance." + default = [] +} + +# variable "access_tags" { +# type = list(string) +# description = "A list of access tags to apply to the SCC WP instance created by the module. For more information, see https://cloud.ibm.com/docs/account?topic=account-access-tags-tutorial." +# default = [] + +# validation { +# condition = alltrue([ +# for tag in var.access_tags : can(regex("[\\w\\-_\\.]+:[\\w\\-_\\.]+", tag)) && length(tag) <= 128 +# ]) +# error_message = "Tags must match the regular expression \"[\\w\\-_\\.]+:[\\w\\-_\\.]+\", see https://cloud.ibm.com/docs/account?topic=account-tag&interface=ui#limits for more details" +# } +# } + +# variable "resource_key_name" { +# type = string +# description = "The name to give the IBM Cloud SCC WP resource key." +# default = "SCCWPManagerKey" +# } + +# variable "resource_key_tags" { +# type = list(string) +# description = "Tags associated with the IBM Cloud SCC WP resource key." +# default = [] +# } + +variable "cspm_enabled" { + description = "Enable Cloud Security Posture Management (CSPM) for the Workload Protection instance. This will create a trusted profile associated with the SCC Workload Protection instance that has viewer / reader access to the App Config service and viewer access to the Enterprise service. [Learn more](https://cloud.ibm.com/docs/workload-protection?topic=workload-protection-about)." + type = bool + default = false + nullable = false +} + +#variable "app_config_crn" { +# description = "The CRN of an existing App Config instance to use with the SCC Workload Protection instance. Required if `cspm_enabled` is true. NOTE: Ensure the App Config instance has configuration aggregator enabled." +# type = string +# default = null +# validation { +# condition = var.cspm_enabled ? var.app_config_crn != null : true +# error_message = "Cannot be `null` if CSPM is enabled." +# } +# validation { +# condition = anytrue([ +# can(regex("^crn:(.*:){3}apprapp:(.*:){2}[0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}::$", var.app_config_crn)), +# var.app_config_crn == null, +# ]) +# error_message = "The provided CRN is not a valid App Config CRN." +# } +#} + +variable "scc_workload_protection_trusted_profile_name" { + description = "The name to give the trusted profile that is created by this module if `cspm_enabled` is `true. Must begin with a letter." + type = string + default = "workload-protection-trusted-profile" + validation { + condition = can(regex("^[a-zA-Z][a-zA-Z0-9\\-_\\.]+$", var.scc_workload_protection_trusted_profile_name)) + error_message = "The trusted profile name must begin with a letter and can only contain letters, numbers, hyphens, underscores, and periods." + } + validation { + condition = !(var.cspm_enabled && var.scc_workload_protection_trusted_profile_name == null) + error_message = "Cannot be `null` if `cspm_enabled` is `true`." 
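# Illustrative sketch only: a hypothetical caller of this SCC Workload
# Protection module, shown to make the validations above concrete. The module
# source path, prefix, and resource group are placeholder values; the plan and
# trusted-profile-name values are simply ones that satisfy the validation rules.
#
# module "sccwp_example" {
#   source                                        = "../../modules/security/sccwp"
#   region                                        = "us-south"
#   resource_group_name                           = "Default"
#   prefix                                        = "hpc-lsf"
#   sccwp_service_plan                            = "free-trial"   # or "graduated-tier"
#   app_config_plan                               = "basic"        # basic, lite, standardv2, or enterprise
#   cspm_enabled                                  = true
#   scc_workload_protection_trusted_profile_name  = "workload-protection-trusted-profile"
# }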
+ } +} + +# variable "cbr_rules" { +# type = list(object({ +# description = string +# account_id = string +# tags = optional(list(object({ +# name = string +# value = string +# })), []) +# rule_contexts = list(object({ +# attributes = optional(list(object({ +# name = string +# value = string +# }))) })) +# enforcement_mode = string +# })) +# description = "The list of context-based restriction rules to create." +# default = [] +# # Validation happens in the rule module +# } + +variable "enable_deployer" { + type = bool + default = true + description = "Deployer should be only used for better deployment performance" +} + +variable "sccwp_enable" { + type = bool + default = true + description = "Flag to enable SCC instance creation. If true, an instance of SCC (Security and Compliance Center) will be created." +} diff --git a/modules/security/sccwp/versions.tf b/modules/security/sccwp/versions.tf new file mode 100644 index 00000000..5edf2175 --- /dev/null +++ b/modules/security/sccwp/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.9.0" + required_providers { + # Use "greater than or equal to" range in modules + ibm = { + source = "ibm-cloud/ibm" + version = ">= 1.70.0, <2.0.0" + } + restapi = { + source = "Mastercard/restapi" + version = ">=2.0.1, <3.0.0" + } + } +} diff --git a/modules/write_inventory/datasource.tf b/modules/write_inventory/datasource.tf new file mode 100644 index 00000000..fa155c3d --- /dev/null +++ b/modules/write_inventory/datasource.tf @@ -0,0 +1,8 @@ +data "ibm_is_instance_profile" "dynamic_worker_profile" { + name = var.dynamic_compute_instances[0].profile +} + +data "ibm_is_image" "dynamic_compute" { + count = local.compute_image_found_in_map ? 0 : 1 + name = var.dynamic_compute_instances[0].image +} diff --git a/modules/write_inventory/image_map.tf b/modules/write_inventory/image_map.tf new file mode 100644 index 00000000..f58ee9d7 --- /dev/null +++ b/modules/write_inventory/image_map.tf @@ -0,0 +1,52 @@ +locals { + image_region_map = { + "hpc-lsf-fp15-rhel810-v1" = { + "eu-es" = "r050-deeeb734-2523-4aff-96e3-2be8d2b0d634" + "eu-gb" = "r018-8edcd9a1-dbca-462f-bf74-017c15ca4b71" + "eu-de" = "r010-394c5295-1704-4066-b57e-ae9bca1968de" + "us-east" = "r014-1777cdcb-8a68-4ef0-becf-84ec0d2e9a26" + "us-south" = "r006-40caf671-28a8-42c5-b83e-b2ba3ceb86af" + "jp-tok" = "r022-01531301-d100-44ba-b1a3-12e7c8d65469" + "jp-osa" = "r034-ac455775-c667-4d3e-b281-9ef845080599" + "au-syd" = "r026-eff4d59c-5006-46cc-8b03-60514f763a87" + "br-sao" = "r042-1e1bbeeb-3ef7-4f7a-a44c-9f50609bb538" + "ca-tor" = "r038-bb9fcdb7-d200-4cdd-af04-6848007c9cb2" + }, + "hpc-lsf-fp15-compute-rhel810-v1" = { + "eu-es" = "r050-f0608e39-9dcf-4aca-9e92-7719474b3e86" + "eu-gb" = "r018-db8b97a8-6f87-4cf7-a044-847da6ab5c59" + "eu-de" = "r010-957efd6b-e7b3-4249-8644-6184f1531915" + "us-east" = "r014-5fdd6a25-5943-4084-9c57-b900a80579a3" + "us-south" = "r006-5c0e462a-679c-4a18-81a5-0fe036f483a3" + "jp-tok" = "r022-8087a984-8912-42ff-9576-c5cab8edda3a" + "jp-osa" = "r034-728d1f12-7842-412c-97a0-9deb66c23962" + "au-syd" = "r026-f957ed22-9565-441c-bce6-f716360e02ea" + "br-sao" = "r042-7bf7d508-a7b1-4434-ae6a-6986f7042d4e" + "ca-tor" = "r038-a658da44-f1b4-4e02-826a-38b16e6ae98a" + }, + "hpc-lsf-fp14-rhel810-v1" = { + "eu-es" = "r050-12a3533c-5fa1-4bcc-8765-7150a06e122e" + "eu-gb" = "r018-3ef87e4e-0f46-424a-b623-fa25215094c0" + "eu-de" = "r010-48e5560b-4d34-43ca-b824-2d85513f3188" + "us-east" = "r014-3719a4e2-6746-4eaf-844a-c3721b7c6d32" + "us-south" = "r006-e720ec63-5e8c-46ce-b7a2-51c454e64099" 
+ "jp-tok" = "r022-917ce78b-dacf-4008-b6c0-4058bf59a5b4" + "jp-osa" = "r034-507fb655-4164-45b8-b1d7-f6cb2fbeafc9" + "au-syd" = "r026-01900450-7314-42ea-aee3-acf5179300c0" + "br-sao" = "r042-bb407137-93cf-4ec7-aa77-4702896fff97" + "ca-tor" = "r038-6683403d-1cf5-4f39-a96f-c8cbb2314ad5" + }, + "hpc-lsf-fp14-compute-rhel810-v1" = { + "eu-es" = "r050-d2ad9625-1668-4b2c-a8bb-6ef14678d3ed" + "eu-gb" = "r018-f1059503-27ec-44d4-a981-21be6225520a" + "eu-de" = "r010-8115b1f6-912e-4b55-89f1-e448c397115e" + "us-east" = "r014-5108884c-011b-4473-b585-0d43309c37e3" + "us-south" = "r006-68c6af72-1abf-4d13-bca1-4f42be5d2c70" + "jp-tok" = "r022-1932c5ec-b5a6-4262-aa56-6c6257c8297f" + "jp-osa" = "r034-50be9bd9-9623-4ffc-8ce7-aab66f674137" + "au-syd" = "r026-11aee148-c938-4524-91e6-8e6da5933a42" + "br-sao" = "r042-5cb62448-e771-4caf-a556-28fdf88acab9" + "ca-tor" = "r038-fa815ec1-d52e-42b2-8221-5b8c2145a248" + } + } +} diff --git a/modules/write_inventory/locals.tf b/modules/write_inventory/locals.tf new file mode 100644 index 00000000..97285d13 --- /dev/null +++ b/modules/write_inventory/locals.tf @@ -0,0 +1,13 @@ +locals { + region = join("-", slice(split("-", var.zones[0]), 0, 2)) + vcpus = tonumber(data.ibm_is_instance_profile.dynamic_worker_profile.vcpu_count[0].value) + ncores = tonumber(local.vcpus / 2) + ncpus = tonumber(var.enable_hyperthreading ? local.vcpus : local.ncores) + mem_in_mb = tonumber(data.ibm_is_instance_profile.dynamic_worker_profile.memory[0].value) * 1024 + rc_max_num = tonumber(var.dynamic_compute_instances[0].count) + rc_profile = var.dynamic_compute_instances[0].profile + boot_volume_encryption_key = jsonencode(var.kms_encryption_enabled ? var.boot_volume_encryption_key : null) + compute_image_found_in_map = contains(keys(local.image_region_map), var.dynamic_compute_instances[0]["image"]) + new_compute_image_id = local.compute_image_found_in_map ? local.image_region_map[var.dynamic_compute_instances[0]["image"]][local.region] : "Image not found with the given name" + image_id = local.compute_image_found_in_map ? local.new_compute_image_id : data.ibm_is_image.dynamic_compute[0].id +} diff --git a/modules/write_inventory/main.tf b/modules/write_inventory/main.tf new file mode 100644 index 00000000..72f87020 --- /dev/null +++ b/modules/write_inventory/main.tf @@ -0,0 +1,44 @@ +# Write provisioned infrastructure details to JSON. +resource "local_sensitive_file" "infra_details_to_json" { + content = < 0 ? "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@${var.bastion_fip} lsfadmin@${local.mgmt_hosts_ips[0]}" : null +} + +output "ssh_to_login_node" { + description = "SSH command to connect to the Login node" + value = var.scheduler == "LSF" && (var.enable_deployer == false) ? "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@${var.bastion_fip} lsfadmin@${local.login_host_ip[0]}" : null +} + +output "ssh_to_ldap_node" { + description = "SSH command to connect to LDAP node" + value = (var.scheduler == "LSF" && var.enable_deployer == false && var.enable_ldap && length(local.ldap_hosts_ips) > 0) ? "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -J ubuntu@${var.bastion_fip} ubuntu@${local.ldap_hosts_ips[0]}" : null +} + +output "cloud_monitoring_url" { + value = var.observability_monitoring_enable && (var.enable_deployer == false) ? 
module.cloud_monitoring_instance_creation[0].cloud_monitoring_url : null + description = "IBM Cloud Monitoring URL" +} + +output "cloud_logs_url" { + value = (var.enable_deployer == false) && (var.observability_logs_enable_for_management || var.observability_logs_enable_for_compute) ? module.cloud_monitoring_instance_creation[0].cloud_logs_url : null + description = "IBM Cloud Logs URL" +} + +output "application_center_tunnel" { + description = "Available if IBM Spectrum LSF Application Center GUI is installed" + value = var.enable_deployer ? "" : local.ssh_cmd +} + +output "application_center_url" { + description = "Available if IBM Spectrum LSF Application Center GUI is installed" + value = "https://localhost:8443" +} diff --git a/solutions/hpc/provider.tf b/provider.tf similarity index 100% rename from solutions/hpc/provider.tf rename to provider.tf diff --git a/samples/configs/hpc_catalog_values.json b/samples/configs/hpc_catalog_values.json index 254f9e50..f9ad780b 100644 --- a/samples/configs/hpc_catalog_values.json +++ b/samples/configs/hpc_catalog_values.json @@ -1,76 +1,68 @@ { - "ibmcloud_api_key" : "Please fill here", - "resource_group" : "Default", - "zones" : "[\"us-east-1\"]", - "cluster_prefix" : "hpcaas", - "cluster_name" : "Please fill here", - "bastion_ssh_keys" : "[\"Please fill here\"]", - "compute_ssh_keys" : "[\"Please fill here\"]", - "remote_allowed_ips" : "[\"Please fill here\"]", - "vpc_name" : "__NULL__", - "cluster_subnet_ids" : "[]", - "login_subnet_id" : "__NULL__", - "vpc_cidr" : "10.241.0.0/18", - "vpc_cluster_private_subnets_cidr_blocks" : "[\"10.241.0.0/20\"]", - "vpc_cluster_login_private_subnets_cidr_blocks" : "[\"10.241.16.0/28\"]", - "dns_domain_name" : "{compute = \"lsf.com\"}", - "dns_instance_id" : "__NULL__", - "dns_custom_resolver_id" : "__NULL__", - "enable_cos_integration" : "false", - "cos_instance_name" : "__NULL__", - "enable_fip" : "true", - "management_image_name" : "hpc-lsf10-rhel810-v2", - "compute_image_name" : "hpcaas-lsf10-rhel810-compute-v8", - "login_image_name" : "hpcaas-lsf10-rhel810-compute-v8", - "login_node_instance_type" : "bx2-2x8", - "management_node_instance_type" : "bx2-16x64", - "management_node_count" : "2", - "worker_node_instance_type" : "[{count = 0, instance_type = \"bx2-4x16\" }, {count = 0, instance_type = \"cx2-4x16\"}]", - "worker_node_max_count" : "10", - "custom_file_shares" : "[{mount_path = \"/mnt/vpcstorage/tools\", size = 100, iops = 2000 }, { mount_path = \"/mnt/vpcstorage/data\", size = 100, iops = 6000 }, { mount_path = \"/mnt/scale/tools\", nfs_share = \"\" }]", - "storage_security_group_id" : "__NULL__", - "hyperthreading_enabled" : "true", - "vpn_enabled" : "false", - "TF_VERSION" : "1.9", - "TF_PARALLELISM" : "250", - "key_management" : "key_protect", - "kms_instance_name" : "__NULL__", - "kms_key_name" : "__NULL__", - "enable_app_center" : "false", - "app_center_gui_pwd" : "", - "app_center_high_availability" : "false", - "app_center_existing_certificate_instance" : "", - "enable_vpc_flow_logs" : "true", - "enable_ldap" : "false", - "ldap_basedns" : "lsf.com", - "ldap_server" : "null", - "ldap_server_cert" : "null", - "ldap_admin_password" : "", - "ldap_user_name" : "", - "ldap_user_password" : "", - "ldap_vsi_profile" : "cx2-2x4", - "ldap_vsi_osimage_name" : "ibm-ubuntu-22-04-4-minimal-amd64-3", - "skip_iam_block_storage_authorization_policy" : "false", - "skip_iam_share_authorization_policy" : "false", - "scc_enable" : "false", - "scc_profile" : "CIS IBM Cloud Foundations Benchmark v1.1.0", - 
"scc_location" : "us-south", - "scc_event_notification_plan" : "lite", - "observability_monitoring_enable" : "true", - "observability_monitoring_on_compute_nodes_enable" : "false", - "observability_monitoring_plan" : "graduated-tier", - "existing_bastion_instance_name" : "__NULL__", + "ibmcloud_api_key": "Please fill here", + "existing_resource_group": "Default", + "zones": "[\"us-east-1\"]", + "cluster_prefix": "hpc-lsf", + "ssh_keys": "[\"Please fill here\"]", + "remote_allowed_ips": "[\"Please fill here\"]", + "app_center_gui_password": "Please fill here", + "lsf_version": "Fixpack_15", + "vpc_name": "__NULL__", + "cluster_subnet_id": "[]", + "login_subnet_id": "__NULL__", + "vpc_cidr": "10.241.0.0/18", + "vpc_cluster_private_subnets_cidr_blocks": "10.241.0.0/20", + "vpc_cluster_login_private_subnets_cidr_blocks": "10.241.16.0/28", + "dns_domain_name": "{compute = \"lsf.com\"}", + "dns_instance_id": "__NULL__", + "dns_custom_resolver_id": "__NULL__", + "bastion_instance": "{ image = \"ibm-ubuntu-22-04-5-minimal-amd64-3\", profile = \"cx2-4x8\" }", + "deployer_instance": "{ image = \"hpc-lsf-fp15-deployer-rhel810-v1\", profile = \"bx2-8x32\" }", + "login_instance": "[{ profile = \"bx2-2x8\", image = \"hpc-lsf-fp15-compute-rhel810-v1\" }]", + "management_instances": "[{ profile = \"bx2-16x64\", count = 2, image = \"hpc-lsf-fp15-rhel810-v1\" }]", + "static_compute_instances": "[{ profile = \"bx2-4x16\", count = 0, image = \"hpc-lsf-fp15-compute-rhel810-v1\" }]", + "dynamic_compute_instances": "[{ profile = \"bx2-4x16\", count = 500, image = \"hpc-lsf-fp15-compute-rhel810-v1\" }]", + "custom_file_shares": "[{mount_path = \"/mnt/vpcstorage/tools\", size = 100, iops = 2000 }, { mount_path = \"/mnt/vpcstorage/data\", size = 100, iops = 6000 }, { mount_path = \"/mnt/scale/tools\", nfs_share = \"\" }]", + "storage_security_group_id": "__NULL__", + "key_management": "key_protect", + "kms_instance_name": "__NULL__", + "kms_key_name": "__NULL__", + "enable_vpc_flow_logs": "false", + "enable_ldap": "false", + "ldap_basedns": "lsf.com", + "ldap_server": "null", + "ldap_server_cert": "null", + "ldap_admin_password": "", + "ldap_user_name": "", + "ldap_user_password": "", + "ldap_instance": "[{ profile = \"cx2-2x4\", image = \"ibm-ubuntu-22-04-5-minimal-amd64-1\" }]", + "skip_iam_share_authorization_policy": "false", + "skip_flowlogs_s2s_auth_policy": "false", + "skip_kms_s2s_auth_policy": "false", + "skip_iam_block_storage_authorization_policy": "false", + "sccwp_enable": "true", + "app_config_plan": "basic", + "cspm_enabled": "true", + "sccwp_service_plan": "free-trial", + "observability_atracker_enable": "true", + "observability_atracker_target_type": "cloudlogs", + "observability_monitoring_enable": "true", + "observability_logs_enable_for_management": "false", + "observability_logs_enable_for_compute": "false", + "observability_enable_platform_logs": "false", + "observability_enable_metrics_routing": "false", + "observability_logs_retention_period": "false", + "observability_monitoring_on_compute_nodes_enable": "false", + "observability_monitoring_plan": "graduated-tier", + "existing_bastion_instance_name": "__NULL__", "existing_bastion_instance_public_ip": "__NULL__", - "existing_bastion_security_group_id" : "__NULL__", - "existing_bastion_ssh_private_key" : "__NULL__", - "observability_atracker_enable" : "true", - "observability_atracker_target_type" : "cloudlogs", - "cos_expiration_days" : "30", - "observability_logs_enable_for_management" : "false", - "observability_logs_enable_for_compute" : 
"false", - "observability_enable_platform_logs" : "false", - "observability_enable_metrics_routing" : "false", - "observability_logs_retention_period" : "7", - "skip_flowlogs_s2s_auth_policy" : "false", - "enable_dedicated_host" : "false" + "existing_bastion_security_group_id": "__NULL__", + "existing_bastion_ssh_private_key": "__NULL__", + "enable_dedicated_host": "false", + "enable_cos_integration": "false", + "cos_instance_name": "__NULL__", + "enable_hyperthreading": "true", + "vpn_enabled": "false", + "TF_VERSION": "1.9", + "TF_PARALLELISM": "250" } diff --git a/samples/configs/hpc_schematics_values.json b/samples/configs/hpc_schematics_values.json index 87d45a9a..7e3bc426 100644 --- a/samples/configs/hpc_schematics_values.json +++ b/samples/configs/hpc_schematics_values.json @@ -1,7 +1,7 @@ { - "name": "hpcaas-test", + "name": "lsf-test", "type": [ - "terraform_v1.5" + "terraform_v1.9" ], "location": "eu-de", "resource_group": "Default", @@ -13,9 +13,9 @@ }, "template_data": [ { - "folder": "solutions/hpc", + "folder": "solutions/lsf", "type": "terraform_v1.9", - "env_values":[ + "env_values": [ { "TF_CLI_ARGS_apply": "-parallelism=250" }, @@ -26,10 +26,10 @@ "TF_CLI_ARGS_destroy": "-parallelism=100" }, { - "VAR1":"" + "VAR1": "" }, { - "VAR2":"" + "VAR2": "" } ], "variablestore": [ @@ -52,50 +52,42 @@ "value": "Please fill here", "type": "string", "secure": true, - "description": "IBM Cloud API key for the IBM Cloud account where the IBM Cloud HPC cluster needs to be deployed. For more information on how to create an API key, see [Managing user API keys](https://cloud.ibm.com/docs/account?topic=account-userapikey)." + "description": "Provide the IBM Cloud API key associated with the account to deploy the IBM Spectrum LSF cluster. This key is used to authenticate your deployment and grant the necessary access to create and manage resources in your IBM Cloud environment, see [Managing user API keys](https://cloud.ibm.com/docs/account?topic=account-userapikey)." }, { - "name": "resource_group", + "name": "existing_resource_group", "value": "Default", "type": "string", "secure": false, - "description": "Specify the name of the existing resource group in your IBM Cloud account where VPC resources will be deployed. By default, the resource group is set to 'Default.' In some older accounts, it may be 'default,' so please verify the resource group name before proceeding. If the value is set to \\\"null\\\", the automation will create two separate resource groups: 'workload-rg' and 'service-rg.' For more details, see Managing resource groups." - + "description": "Specify the existing resource group name from your IBM Cloud account where the VPC resources should be deployed. By default, the resource group name is set to 'Default.' Note that in some older accounts, the resource group name may be 'default,' so please validate the resource_group name before deployment. If the resource group value is set to the string \"null\", the automation will create two different resource groups named 'workload-rg' and 'service-rg.' For more information on resource groups, refer to Managing resource groups." }, { "name": "zones", - "value": "[\"us-east-1\"]", + "value": "[\"eu-de-1\"]", "type": "list(string)", "secure": false, "description": "Specify the IBM Cloud zone within the chosen region where the IBM Spectrum LSF cluster will be deployed. 
A single zone input is required, and the management nodes, file storage shares, and compute nodes will all be provisioned in this zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli)." }, { - "name": "cluster_prefix", - "value": "hpcaas", - "type": "string", - "secure": false, - "description": "The prefix is used to name the IBM Cloud LSF cluster and the resources provisioned to build the cluster instance. Each Spectrum LSF cluster must have a unique name, so ensure the prefix is distinct. It must begin with a lowercase letter and can only include lowercase letters, digits, and hyphens. Hyphens must be followed by a lowercase letter or digit, with no leading, trailing, or consecutive hyphens. The prefix length must be less than 16 characters." - }, - { - "name": "cluster_name", - "value": "Please fill here", + "name": "lsf_version", + "value": "fixpack_15", "type": "string", "secure": false, - "description": "Provide a unique cluster name that LSF uses to configure and group the cluster. Without this name, LSF cannot form a cluster, and the initial deployments will fail. The cluster name can be up to 39 alphanumeric characters and may include underscores (_), hyphens (-), and periods (.). Spaces and other special characters are not allowed. Avoid using the name of any host or user as the cluster name. Note that the cluster name cannot be changed after deployment." + "description": "Select the desired version of IBM Spectrum LSF to deploy either fixpack_15 or fixpack_14. By default, the solution uses the latest available version, which is Fix Pack 15. If you need to deploy an earlier version such as Fix Pack 14, update the lsf_version field to fixpack_14. When changing the LSF version, ensure that all custom images used for management, compute, and login nodes correspond to the same version. This is essential to maintain compatibility across the cluster and to prevent deployment issues." }, { - "name": "bastion_ssh_keys", + "name": "ssh_keys", "value": "[\"Please fill here\"]", "type": "list(string)", "secure": false, - "description": "Provide the list of SSH key names configured in your IBM Cloud account to establish a connection to the Spectrum LSF bastion and login node. Make sure the SSH key exists in the same resource group and region where the cluster is being provisioned. To pass multiple SSH keys, use the format [\"key-name-1\", \"key-name-2\"]. If you don't have an SSH key in your IBM Cloud account, you can create one by following the provided .[SSH Keys](https://cloud.ibm.com/docs/vpc?topic=vpc-ssh-keys)." + "description": "Provide the list of SSH key names configured in your IBM Cloud account to establish a connection to the IBM Cloud HPC bastion and login node. Ensure the SSH key is present in the same resource group and region where the cluster is being provisioned. If you do not have an SSH key in your IBM Cloud account, create one by following the provided instructions.[SSH Keys](https://cloud.ibm.com/docs/vpc?topic=vpc-ssh-keys)." }, { - "name": "compute_ssh_keys", - "value": "[\"Please fill here\"]", - "type": "list(string)", - "secure": false, - "description": "Provide the list of SSH key names configured in your IBM Cloud account to establish a connection to the Spectrum LSF cluster node. Ensure the SSH key is present in the same resource group and region where the cluster is being provisioned. To pass multiple SSH keys, use the format [\"key-name-1\", \"key-name-2\"]. 
If you do not have an SSH key in your IBM Cloud account, create one by following the provided instructions.[SSH Keys](https://cloud.ibm.com/docs/vpc?topic=vpc-ssh-keys)." + "name": "app_center_gui_password", + "value": "Please fill here", + "type": "string", + "secure": true, + "description": "Password required to access the IBM Spectrum LSF Application Center (App Center) GUI, which is enabled by default in both Fix Pack 15 and Fix Pack 14 with HTTPS. This is a mandatory value and omitting it will result in deployment failure. The password must meet the following requirements, at least 8 characters in length, and must include one uppercase letter, one lowercase letter, one number, and one special character." }, { "name": "remote_allowed_ips", @@ -104,6 +96,13 @@ "secure": false, "description": "Comma-separated list of IP addresses that can access the IBM Cloud HPC cluster instance through an SSH interface. For security purposes, provide the public IP addresses assigned to the devices that are authorized to establish SSH connections (for example, [\"169.45.117.34\"]). To fetch the IP address of the device, use [https://ipv4.icanhazip.com/](https://ipv4.icanhazip.com/)." }, + { + "name": "cluster_prefix", + "value": "hpc-lsf", + "type": "string", + "secure": false, + "description": "Prefix that is used to name the IBM Cloud HPC cluster and IBM Cloud resources that are provisioned to build the IBM Cloud HPC cluster instance. You cannot create more than one instance of the IBM Cloud HPC cluster with the same name. Ensure that the name is unique. Prefix must start with a lowercase letter and contain only lowercase letters, digits, and hyphens in between. Hyphens must be followed by at least one lowercase letter or digit. There are no leading, trailing, or consecutive hyphens.Character length for cluster_prefix should be less than 16." + }, { "name": "vpc_name", "value": "__NULL__", @@ -112,18 +111,18 @@ "description": "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" }, { - "name": "cluster_subnet_ids", - "value": "[]", - "type": "list(string)", + "name": "cluster_subnet_id", + "value": "__NULL__", + "type": "string", "secure": false, - "description": "Provide the list of existing subnet ID under the existing VPC where the cluster will be provisioned. One subnet ID is required as input value. The management nodes, file storage shares, and compute nodes will be deployed in the same zone." + "description": "Provide the list of existing subnet ID under the existing VPC where the cluster will be provisioned. One subnet ID is required as input value. Supported zones are: eu-de-2 and eu-de-3 for eu-de, us-east-1 and us-east-3 for us-east, and us-south-1 for us-south. The management nodes, file storage shares, and compute nodes will be deployed in the same zone." }, { "name": "login_subnet_id", "value": "__NULL__", "type": "string", "secure": false, - "description": "Provide the list of existing subnet ID under the existing VPC, where the login/bastion server will be provisioned. One subnet id is required as input value for the creation of login node and bastion in the same zone as the management nodes. Note: Provide a different subnet id for login_subnet_id, do not overlap or provide the same subnet id that was already provided for cluster_subnet_ids." 
+ "description": "Provide the list of existing subnet ID under the existing VPC, where the login/bastion server will be provisioned. One subnet id is required as input value for the creation of login node and bastion in the same zone as the management nodes. Note: Provide a different subnet id for login_subnet_id, do not overlap or provide the same subnet id that was already provided for cluster_subnet_id." }, { "name": "vpc_cidr", @@ -134,15 +133,15 @@ }, { "name": "vpc_cluster_private_subnets_cidr_blocks", - "value": "[\"10.241.0.0/20\"]", - "type": "list(string)", + "value": "10.241.0.0/20", + "type": "string", "secure": false, "description": "Provide the CIDR block required for the creation of the compute cluster's private subnet. One CIDR block is required. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Ensure the selected CIDR block size can accommodate the maximum number of management and dynamic compute nodes expected in your cluster. For more information on CIDR block size selection, refer to the documentation, see [Choosing IP ranges for your VPC](https://cloud.ibm.com/docs/vpc?topic=vpc-choosing-ip-ranges-for-your-vpc)." }, { "name": "vpc_cluster_login_private_subnets_cidr_blocks", - "value": "[\"10.241.16.0/28\"]", - "type": "list(string)", + "value": "10.241.16.0/28", + "type": "string", "secure": false, "description": "Provide the CIDR block required for the creation of the login cluster's private subnet. Only one CIDR block is needed. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Since the login subnet is used only for the creation of login virtual server instances, provide a CIDR range of /28." }, @@ -158,7 +157,7 @@ "value": "__NULL__", "type": "string", "secure": false, - "description": "Provide the ID of an existing IBM Cloud DNS service instance to avoid creating a new one. Note: If dns_instance_id is not set to null, a new DNS zone will be created within the specified DNS service instance." + "description": "Provide the id of existing IBM Cloud DNS services domain to skip creating a new DNS service instance name.Note: If dns_instance_id is not equal to null, a new dns zone will be created under the existing dns service instance." }, { "name": "dns_custom_resolver_id", @@ -182,88 +181,67 @@ "description": "Provide the name of the existing cos instance to store vpc flow logs." }, { - "name": "enable_fip", - "value": "true", - "type": "bool", - "secure": false, - "description": "The solution supports multiple ways to connect to your IBM Cloud HPC cluster for example, using a login node, or using VPN or direct connection. If connecting to the IBM Cloud HPC cluster using VPN or direct connection, set this value to false." - }, - { - "name": "management_image_name", - "value": "hpc-lsf10-rhel810-v2", - "type": "string", + "name": "bastion_instance", + "value": "{image = \"ibm-ubuntu-22-04-5-minimal-amd64-3\", profile = \"cx2-4x8\"}", + "type": "object({ image = string, profile = string })", "secure": false, - "description": "Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud Spectrum LSF cluster management nodes. By default, the solution uses a RHEL810 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). 
If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the lsf cluster through this offering." + "description": "Specify the Bastion node configuration, including instance profile and image name. Only Ubuntu stock images available in the IBM Cloud account and region are supported." }, { - "name": "compute_image_name", - "value": "hpcaas-lsf10-rhel810-compute-v8", - "type": "string", + "name": "deployer_instance", + "value": "{image = \"hpc-lsf-fp15-deployer-rhel810-v1\", profile = \"bx2-8x32\" }", + "type": "object({ image = string, profile = string })", "secure": false, - "description": "Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud Spectrum LSF cluster compute (static/dynamic) nodes. By default, the solution uses a RHEL 8-10 base OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the lsf cluster through this offering." - + "description": "Specify the deployer node configuration, including instance profile and image name. By default, the image is set for fixpack_15. Use 'hpc-lsf-fp14-deployer-rhel810-v1' if deploying with fixpack_14, and ensure lsf_version matches the image." }, { - "name": "login_image_name", - "value": "hpcaas-lsf10-rhel810-compute-v8", - "type": "string", + "name": "management_instances", + "value": "[{ profile = \"bx2-16x64\", count = 2, image = \"hpc-lsf-fp15-rhel810-v1\" }]", + "type": "list(object({ profile = string, count = number, image = string }))", "secure": false, - "description": "Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud Spectrum LSF cluster login node. By default, the solution uses a RHEL 8-10 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the lsf cluster through this offering." + "description": "Specify the list of management node configurations, including instance profile, image name, and count. By default, all management nodes are created using Fix Pack 15. If deploying with Fix Pack 14, set lsf_version to fixpack_14 and use the corresponding image hpc-lsf-fp14-rhel810-v1. The selected image must align with the specified lsf_version, any mismatch may lead to deployment failures. The solution allows customization of instance profiles and counts, but mixing custom images and IBM stock images across instances is not supported. If using IBM stock images, only Red Hat-based images are allowed." 
}, { - "name": "login_node_instance_type", - "value": "bx2-2x8", - "type": "string", + "name": "static_compute_instances", + "value": "[{ profile = \"bx2-4x16\", count = 0, image = \"hpc-lsf-fp15-compute-rhel810-v1\" }]", + "type": "list(object({ profile = string, count = number, image = string }))", "secure": false, - "description": "Specify the virtual server instance profile type to be used to create the login node for the IBM Cloud HPC cluster. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." + "description": "Specify the list of static compute node configurations, including instance profile, image name, and count. By default, all compute nodes are created using Fix Pack 15. If deploying with Fix Pack 14, set lsf_version to fixpack_14 and use the corresponding image hpc-lsf-fp14-compute-rhel810-v1. The selected image must align with the specified lsf_version, any mismatch may lead to deployment failures. The solution allows customization of instance profiles and counts, but mixing custom images and IBM stock images across instances is not supported. If using IBM stock images, only Red Hat-based images are allowed." }, { - "name": "worker_node_instance_type", - "value": "[{count = 0, instance_type = \"bx2-4x16\" }, {count = 0, instance_type = \"cx2-4x16\"}]", - "type": "list(object({count = number,instance_type = string}))", + "name": "dynamic_compute_instances", + "value": "[{ profile = \"bx2-4x16\", count = 500, image = \"hpc-lsf-fp15-compute-rhel810-v1\" }]", + "type": "list(object({ profile = string, count = number, image = string }))", "secure": false, - "description": "The minimum number of worker nodes represents the static nodes provisioned during cluster creation. The solution supports different instance types, so specify the node count based on the requirements for each instance profile. For dynamic node provisioning, the automation will select the first profile from the list. Ensure sufficient account-level capacity if specifying a higher instance profile.For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." + "description": "Specify the list of dynamic compute node configurations, including instance profile, image name, and count. By default, all dynamic compute nodes are created using Fix Pack 15. If deploying with Fix Pack 14, set lsf_version to fixpack_14 and use the corresponding image hpc-lsf-fp14-compute-rhel810-v1. The selected image must align with the specified lsf_version, any mismatch may lead to deployment failures. Currently, only a single instance profile is supported for dynamic compute nodes; multiple profiles are not yet supported." }, { - "name": "worker_node_max_count", - "value": "10", - "type": "number", + "name": "login_instance", + "value": "[{ profile = \"bx2-2x8\", image = \"hpc-lsf-fp15-compute-rhel810-v1\" }]", + "type": "list(object({ profile = string, image = string }))", "secure": false, - "description": "The maximum number of worker nodes that can be deployed in the Spectrum LSF cluster. In order to use the [Resource Connector](https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=lsf-resource-connnector) feature to dynamically create and delete worker nodes based on workload demand, the value selected for this parameter must be larger than the totall count of worker_node_instance_type. If you plan to deploy only static worker nodes in the LSF cluster."
- }, - { - "name": "management_node_instance_type", - "value": "bx2-16x64", - "type": "string", - "secure": false, - "description" : "Specify the virtual server instance profile type to be used to create the management nodes for the IBM Spectrum LSF cluster. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." - }, - { - "name": "management_node_count", - "value": "2", - "type": "number", - "secure": false, - "description": "Number of management nodes. This is the total number of management nodes. Enter a value between 1 and 10." + "description": "Specify the list of login node configurations, including instance profile and image name. By default, the login node is created using Fix Pack 15. If deploying with Fix Pack 14, set lsf_version to fixpack_14 and use the corresponding image hpc-lsf-fp14-compute-rhel810-v1. The selected image must align with the specified lsf_version, any mismatch may lead to deployment failures." }, { "name": "custom_file_shares", "value": "[{mount_path = \"/mnt/vpcstorage/tools\", size = 100, iops = 2000 }, { mount_path = \"/mnt/vpcstorage/data\", size = 100, iops = 6000 }, { mount_path = \"/mnt/scale/tools\", nfs_share = \"\" }]", "type": "list(object({mount_path = string,size = optional(number),iops = optional(number),nfs_share = optional(string)}))", "secure": false, - "description": "Provide details for customizing your shared file storage layout, including mount points, sizes (in GB), and IOPS ranges for up to five file shares if using VPC file storage as the storage option.If using IBM Storage Scale as an NFS mount, update the appropriate mount path and nfs_share values created from the Storage Scale cluster. Note that VPC file storage supports attachment to a maximum of 256 nodes. Exceeding this limit may result in mount point failures due to attachment restrictions.For more information, see [Storage options](https://test.cloud.ibm.com/docs/hpc-ibm-spectrumlsf?topic=hpc-ibm-spectrumlsf-integrating-scale#integrate-scale-and-hpc)." + "description": "Mount points, sizes in GB, and IOPS ranges of the file shares that can be used to customize the shared file storage layout. Provide the details for up to 5 shares. Each file share size in GB supports a different range of IOPS. For more information, see [file share IOPS value](https://cloud.ibm.com/docs/vpc?topic=vpc-file-storage-profiles&interface=ui)." }, - { "name": "storage_security_group_id", + { + "name": "storage_security_group_id", "value": "__NULL__", "type": "string", "secure": false, - "description" : "Provide the storage security group ID from the Spectrum Scale storage cluster if the mount_path in the cluster_file_share variable is set to use Scale fileset mount points. This security group is essential for establishing connections between the Spectrum LSF cluster nodes and NFS mount points, ensuring the nodes can access the specified mount points." + "description": "Provide the storage security group ID from the Spectrum Scale storage cluster when an nfs_share value is specified for a given mount_path in the cluster_file_share variable. This security group is necessary to enable network connectivity between the Spectrum LSF cluster nodes and the NFS mount point, ensuring successful access to the shared file system."
}, { - "name": "hyperthreading_enabled", + "name": "enable_hyperthreading", "value": "true", "type": "bool", "secure": false, - "description": "Enabling this setting (true by default) allows hyper-threading on the nodes of the cluster, improving overall processing efficiency by permitting each CPU core to execute multiple threads simultaneously. If set to false, hyperthreading will be disabled, which may be preferable for certain workloads requiring dedicated, non-threaded CPU resources for optimal performance. Carefully consider the nature of your computational tasks when configuring this option to achieve the best balance between performance and resource utilization." + "description": "Setting this to true will enable hyper-threading in the compute nodes of the cluster (default). Otherwise, hyper-threading will be disabled." }, { "name": "vpn_enabled", @@ -293,38 +271,12 @@ "secure": false, "description": "Provide the existing kms key name that you want to use for the IBM Cloud HPC cluster. Note: kms_key_name to be considered only if key_management value is set as key_protect.(for example kms_key_name: my-encryption-key)." }, - { - "name": "enable_app_center", - "value": "false", - "type": "bool", - "secure": false, - "description": "Set to true to enable the IBM Spectrum LSF Application Center GUI (default: false). [System requirements](https://www.ibm.com/docs/en/slac/10.2.0?topic=requirements-system-102-fix-pack-14) for IBM Spectrum LSF Application Center Version 10.2 Fix Pack 14." - }, - { - "name": "app_center_gui_pwd", - "value": "", - "type": "string", - "secure": true, - "description": "Password for IBM Spectrum LSF Application Center GUI. Note: Password should be at least 8 characters, must have one number, one lowercase letter, one uppercase letter, and at least one special character." - }, - { "name": "app_center_high_availability", - "value": "false", - "type": "bool", - "secure": false, - "description": "Set to false to disable the IBM Spectrum LSF Application Center GUI High Availability (default: true)." - }, - { "name": "app_center_existing_certificate_instance", - "value": "", - "type": "string", - "secure": false, - "description": "When app_center_high_availability is enable/set as true, The Application Center will be configured for high availability and requires a Application Load Balancer Front End listener to use a certificate CRN value stored in the Secret Manager. Provide the valid 'existing_certificate_instance' to configure the Application load balancer." - }, { "name": "enable_vpc_flow_logs", "value": "false", "type": "bool", "secure": false, - "description": "This flag determines whether VPC flow logs are enabled. When set to true, a flow log collector will be created to capture and monitor network traffic data within the VPC. Enabling flow logs provides valuable insights for troubleshooting, performance monitoring, and security auditing by recording information about the traffic passing through your VPC. Consider enabling this feature to enhance visibility and maintain robust network management practices." + "description": "Flag to enable VPC flow logs. If true, a flow log collector will be created." }, { "name": "enable_ldap", @@ -376,25 +328,18 @@ "description": "The LDAP user password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. 
It must also include two numerical digits and at least one special character from (~@_+:) are required.It is important to avoid including the username in the password for enhanced security.[This value is ignored for an existing LDAP server]." }, { - "name": "ldap_vsi_profile", - "value": "cx2-2x4", - "type": "string", + "name": "ldap_instance", + "value": "[{ profile = \"cx2-2x4\", image = \"ibm-ubuntu-22-04-5-minimal-amd64-3\" }]", + "type": "list(object({ profile = string, count = number, image = string }))", "secure": false, - "description": "Profile to be used for LDAP virtual server instance." - }, - { - "name": "ldap_vsi_osimage_name", - "value": "ibm-ubuntu-22-04-4-minimal-amd64-3", - "type": "string", - "secure": false, - "description": "Image name to be used for provisioning the LDAP instances." + "description": "Specify the list of LDAP node configurations, including the instance profile and image name, that are used to provision the LDAP server instance." }, { "name": "skip_iam_block_storage_authorization_policy", "value": "false", "type": "bool", "secure": false, - "description": "Set to false if authorization policy is required for VPC block storage volumes to access kms. This can be set to true if authorization policy already exists. For more information on how to create authorization policy manually, see [creating authorization policies for block storage volume](https://cloud.ibm.com/docs/vpc?topic=vpc-block-s2s-auth&interface=ui)." + "description": "When using an existing KMS instance name, set this value to true if authorization is already enabled between the KMS instance and the block storage volume. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment. For more information on how to create authorization policy manually, see [creating authorization policies for block storage volume](https://cloud.ibm.com/docs/vpc?topic=vpc-block-s2s-auth&interface=ui)." }, { "name": "skip_iam_share_authorization_policy", @@ -404,32 +349,46 @@ "description": "Set it to false if authorization policy is required for VPC file share to access kms. This can be set to true if authorization policy already exists. For more information on how to create authorization policy manually, see [creating authorization policies for VPC file share](https://cloud.ibm.com/docs/vpc?topic=vpc-file-s2s-auth&interface=ui)." }, { - "name": "scc_enable", + "name": "skip_flowlogs_s2s_auth_policy", + "value": "false", + "type": "bool", + "secure": false, + "description": "When using an existing COS instance, set this value to true if authorization is already enabled between the COS instance and the flow logs service. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment." + }, + { + "name": "skip_kms_s2s_auth_policy", "value": "false", "type": "bool", "secure": false, - "description": "Flag to enable SCC instance creation. If true, an instance of SCC (Security and Compliance Center) will be created." + "description": "When using an existing COS instance, set this value to true if authorization is already enabled between the COS instance and the KMS instance. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment."
+ }, + { + "name": "sccwp_enable", + "value": "true", + "type": "bool", + "secure": false, + "description": "Set this flag to true to create an instance of IBM Security and Compliance Center (SCC) Workload Protection. When enabled, it provides tools to discover and prioritize vulnerabilities, monitor for security threats, and enforce configuration, permission, and compliance policies across the full lifecycle of your workloads." }, { - "name": "scc_profile", - "value": "CIS IBM Cloud Foundations Benchmark v1.1.0", + "name": "sccwp_service_plan", + "value": "free-trial", "type": "string", "secure": false, - "description": "Profile to be set on the SCC Instance (accepting empty, 'CIS IBM Cloud Foundations Benchmark v1.1.0' and 'IBM Cloud Framework for Financial Services')" + "description": "Specify the plan type for the Security and Compliance Center (SCC) Workload Protection instance. Valid values are free-trial and graduated-tier only." }, { - "name": "scc_location", - "value": "us-south", + "name": "cspm_enabled", + "value": "true", "type": "string", "secure": false, - "description": "Location where the SCC instance is provisioned (possible choices 'us-south', 'eu-de', 'ca-tor', 'eu-es')" + "description": "Enable Cloud Security Posture Management (CSPM) for the Workload Protection instance. This will create a trusted profile associated with the SCC Workload Protection instance that has viewer / reader access to the App Config service and viewer access to the Enterprise service. [Learn more](https://cloud.ibm.com/docs/workload-protection?topic=workload-protection-about)." }, { - "name": "scc_event_notification_plan", - "value": "lite", + "name": "app_config_plan", + "value": "basic", "type": "string", "secure": false, - "description": "Event Notifications Instance plan to be used (it's used with S.C.C. instance), possible values 'lite' and 'standard'" + "description": "Specify the IBM service pricing plan for the application. Allowed values are 'basic', 'lite', 'standard', 'enterprise'." }, { "name": "observability_monitoring_enable", "value": "true", @@ -465,13 +424,6 @@ "secure": false, "description": "Determines where all events can be stored based on the user input. Select the desired target type to retrieve or capture events into your system." }, - { - "name": "cos_expiration_days", - "value": "30", - "type": "number", - "secure": false, - "description": "Specify the retention period for objects in COS buckets by setting the number of days after their creation for automatic expiration. This configuration helps manage storage efficiently by removing outdated or unnecessary data, reducing storage costs, and maintaining data lifecycle policies. Ensure that the specified duration aligns with your data retention and compliance requirements." - }, { "name": "observability_logs_enable_for_management", "value": "false", @@ -508,46 +460,32 @@ "description": "The number of days IBM Cloud Logs retains the logs data in priority insights. By default the value is set as 7, but the allowed values are 14, 30, 60, and 90." }, { - "name": "skip_flowlogs_s2s_auth_policy", - "value": "false", - "type": "bool", - "secure": false, - "description": "Skip auth policy between flow logs service and COS instance, set to true if this policy is already in place on account." - }, - { - "name": "enable_dedicated_host", - "value": "false", - "type": "bool", - "secure": false, - "description": "Set this option to true to enable dedicated hosts for the VSI created for workload servers. The default value is false. 
When a dedicated host is enabled, the solution supports only static worker nodes with a single profile, and multiple profile combinations are not supported. For example, you can select a profile from a single family, such as bx2, cx2, or mx2. If you are provisioning a static cluster with a third-generation profile, ensure that dedicated hosts are supported in the chosen regions, as not all regions support dedicated hosts for third-gen profiles. To learn more about dedicated host, [click here.](https://cloud.ibm.com/docs/vpc?topic=vpc-dh-profiles&interface=ui)." - }, - { - "name": "bastion_instance_name", + "name": "existing_bastion_instance_name", "value": "__NULL__", "type": "string", "secure": false, - "description" : "Provide the name of the bastion instance. If none given then new bastion will be created." + "description": "Provide the name of the bastion instance. If none given then new bastion will be created." }, { - "name": "bastion_instance_public_ip", + "name": "existing_bastion_instance_public_ip", "value": "__NULL__", "type": "string", "secure": false, - "description" : "Provide the public ip address of the bastion instance to establish the remote connection." + "description": "Provide the public ip address of the bastion instance to establish the remote connection." }, { - "name": "bastion_security_group_id", + "name": "existing_bastion_security_group_id", "value": "__NULL__", "type": "string", "secure": false, - "description" : "Specify the security group ID for the bastion server. This ID will be added as an allowlist rule on the HPC cluster nodes to facilitate secure SSH connections through the bastion node. By restricting access through a bastion server, this setup enhances security by controlling and monitoring entry points into the cluster environment. Ensure that the specified security group is correctly configured to permit only authorized traffic for secure and efficient management of cluster resources." + "description": "Provide the security group ID of the bastion server. This security group ID will be added as an allowlist rule on the HPC cluster nodes to establish an SSH connection through the bastion node." }, { - "name": "bastion_ssh_private_key", + "name": "existing_bastion_ssh_private_key", "value": "__NULL__", "type": "string", "secure": false, - "description" : "Provide the private SSH key (named id_rsa) used during the creation and configuration of the bastion server to securely authenticate and connect to the bastion server. This allows access to internal network resources from a secure entry point. Note: The corresponding public SSH key (named id_rsa.pub) must already be available in the ~/.ssh/authorized_keys file on the bastion host to establish authentication." + "description": "Provide the private SSH key (named id_rsa) used during the creation and configuration of the bastion server to securely authenticate and connect to the bastion server. This allows access to internal network resources from a secure entry point. Note: The corresponding public SSH key (named id_rsa.pub) must already be available in the ~/.ssh/authorized_keys file on the bastion host to establish authentication." 
} ] } diff --git a/solutions/custom/README.md b/solutions/custom/README.md new file mode 100644 index 00000000..e69de29b diff --git a/solutions/custom/catalogValidationValues.json.template b/solutions/custom/catalogValidationValues.json.template new file mode 100644 index 00000000..bb5298d4 --- /dev/null +++ b/solutions/custom/catalogValidationValues.json.template @@ -0,0 +1,7 @@ +{ + "ibmcloud_api_key": $VALIDATION_APIKEY, + "cluster_prefix": $PREFIX, + "zones": "[\"ca-tor-1\"]", + "existing_resource_group": "geretain-hpc-rg", + "ssh_keys": "[\"geretain-hpc-ssh-key\"]" +} diff --git a/solutions/custom/locals.tf b/solutions/custom/locals.tf new file mode 100644 index 00000000..70e246d4 --- /dev/null +++ b/solutions/custom/locals.tf @@ -0,0 +1,98 @@ +# locals needed for ibm provider +locals { + # Region and Zone calculations + region = join("-", slice(split("-", var.zones[0]), 0, 2)) +} + +locals { + override_json_path = abspath("./override.json") + override = { + override = jsondecode(var.override && var.override_json_string == null ? + (local.override_json_path == "" ? file("${path.root}/override.json") : file(local.override_json_path)) + : + "{}") + override_json_string = jsondecode(var.override_json_string == null ? "{}" : var.override_json_string) + } + override_type = var.override_json_string == null ? "override" : "override_json_string" +} + +locals { + config = { + existing_resource_group = var.existing_resource_group + remote_allowed_ips = var.remote_allowed_ips + deployer_instance = var.deployer_instance + ssh_keys = var.ssh_keys + vpc_cluster_login_private_subnets_cidr_blocks = var.vpc_cluster_login_private_subnets_cidr_blocks + compute_gui_password = var.compute_gui_password + compute_gui_username = var.compute_gui_username + vpc_cluster_private_subnets_cidr_blocks = var.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = var.cos_instance_name + dns_custom_resolver_id = var.dns_custom_resolver_id + dns_instance_id = var.dns_instance_id + dns_domain_names = var.dns_domain_names + dynamic_compute_instances = var.dynamic_compute_instances + enable_atracker = var.enable_atracker + # enable_bastion = var.enable_bastion + enable_cos_integration = var.enable_cos_integration + enable_vpc_flow_logs = var.enable_vpc_flow_logs + custom_file_shares = var.custom_file_shares + hpcs_instance_name = var.hpcs_instance_name + key_management = var.key_management + client_instances = var.client_instances + client_subnets_cidr = var.client_subnets_cidr + management_instances = var.management_instances + vpc_cidr = var.vpc_cidr + nsd_details = var.nsd_details + placement_strategy = var.placement_strategy + cluster_prefix = var.cluster_prefix + protocol_instances = var.protocol_instances + protocol_subnets_cidr = var.protocol_subnets_cidr + static_compute_instances = var.static_compute_instances + storage_gui_password = var.storage_gui_password + storage_gui_username = var.storage_gui_username + storage_instances = var.storage_instances + storage_subnets_cidr = var.storage_subnets_cidr + vpc_name = var.vpc_name + } +} + +# Compile Environment for Config output +locals { + env = { + existing_resource_group = lookup(local.override[local.override_type], "existing_resource_group", local.config.existing_resource_group) + remote_allowed_ips = lookup(local.override[local.override_type], "remote_allowed_ips", local.config.remote_allowed_ips) + deployer_instance = lookup(local.override[local.override_type], "deployer_instance", local.config.deployer_instance) + ssh_keys = 
lookup(local.override[local.override_type], "ssh_keys", local.config.ssh_keys) + vpc_cluster_login_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_login_private_subnets_cidr_blocks", local.config.vpc_cluster_login_private_subnets_cidr_blocks) + compute_gui_password = lookup(local.override[local.override_type], "compute_gui_password", local.config.compute_gui_password) + compute_gui_username = lookup(local.override[local.override_type], "compute_gui_username", local.config.compute_gui_username) + vpc_cluster_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_private_subnets_cidr_blocks", local.config.vpc_cluster_private_subnets_cidr_blocks) + cos_instance_name = lookup(local.override[local.override_type], "cos_instance_name", local.config.cos_instance_name) + dns_custom_resolver_id = lookup(local.override[local.override_type], "dns_custom_resolver_id", local.config.dns_custom_resolver_id) + dns_instance_id = lookup(local.override[local.override_type], "dns_instance_id", local.config.dns_instance_id) + dns_domain_names = lookup(local.override[local.override_type], "dns_domain_names", local.config.dns_domain_names) + dynamic_compute_instances = lookup(local.override[local.override_type], "dynamic_compute_instances", local.config.dynamic_compute_instances) + enable_atracker = lookup(local.override[local.override_type], "enable_atracker", local.config.enable_atracker) + # enable_bastion = lookup(local.override[local.override_type], "enable_bastion", local.config.enable_bastion) + enable_cos_integration = lookup(local.override[local.override_type], "enable_cos_integration", local.config.enable_cos_integration) + enable_vpc_flow_logs = lookup(local.override[local.override_type], "enable_vpc_flow_logs", local.config.enable_vpc_flow_logs) + custom_file_shares = lookup(local.override[local.override_type], "custom_file_shares", local.config.custom_file_shares) + hpcs_instance_name = lookup(local.override[local.override_type], "hpcs_instance_name", local.config.hpcs_instance_name) + key_management = lookup(local.override[local.override_type], "key_management", local.config.key_management) + client_instances = lookup(local.override[local.override_type], "client_instances", local.config.client_instances) + client_subnets_cidr = lookup(local.override[local.override_type], "client_subnets_cidr", local.config.client_subnets_cidr) + management_instances = lookup(local.override[local.override_type], "management_instances", local.config.management_instances) + vpc_cidr = lookup(local.override[local.override_type], "vpc_cidr", local.config.vpc_cidr) + nsd_details = lookup(local.override[local.override_type], "nsd_details", local.config.nsd_details) + placement_strategy = lookup(local.override[local.override_type], "placement_strategy", local.config.placement_strategy) + cluster_prefix = lookup(local.override[local.override_type], "cluster_prefix", local.config.cluster_prefix) + protocol_instances = lookup(local.override[local.override_type], "protocol_instances", local.config.protocol_instances) + protocol_subnets_cidr = lookup(local.override[local.override_type], "protocol_subnets_cidr", local.config.protocol_subnets_cidr) + static_compute_instances = lookup(local.override[local.override_type], "static_compute_instances", local.config.static_compute_instances) + storage_gui_password = lookup(local.override[local.override_type], "storage_gui_password", local.config.storage_gui_password) + storage_gui_username = 
lookup(local.override[local.override_type], "storage_gui_username", local.config.storage_gui_username) + storage_instances = lookup(local.override[local.override_type], "storage_instances", local.config.storage_instances) + storage_subnets_cidr = lookup(local.override[local.override_type], "storage_subnets_cidr", local.config.storage_subnets_cidr) + vpc_name = lookup(local.override[local.override_type], "vpc_name", local.config.vpc_name) + } +} diff --git a/solutions/custom/main.tf b/solutions/custom/main.tf new file mode 100644 index 00000000..e3ed87f4 --- /dev/null +++ b/solutions/custom/main.tf @@ -0,0 +1,42 @@ +module "custom" { + source = "./../.." + scheduler = var.scheduler + ibm_customer_number = var.ibm_customer_number + zones = var.zones + remote_allowed_ips = var.remote_allowed_ips + cluster_prefix = local.env.cluster_prefix + ssh_keys = local.env.ssh_keys + existing_resource_group = local.env.existing_resource_group + deployer_instance = local.env.deployer_instance + vpc_cluster_login_private_subnets_cidr_blocks = local.env.vpc_cluster_login_private_subnets_cidr_blocks + vpc_cluster_private_subnets_cidr_blocks = local.env.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = local.env.cos_instance_name + dns_custom_resolver_id = local.env.dns_custom_resolver_id + dns_instance_id = local.env.dns_instance_id + dns_domain_names = local.env.dns_domain_names + dynamic_compute_instances = local.env.dynamic_compute_instances + enable_atracker = local.env.enable_atracker + # enable_bastion = local.env.enable_bastion + enable_cos_integration = local.env.enable_cos_integration + enable_vpc_flow_logs = local.env.enable_vpc_flow_logs + custom_file_shares = local.env.custom_file_shares + key_management = local.env.key_management + client_instances = local.env.client_instances + management_instances = local.env.management_instances + vpc_cidr = local.env.vpc_cidr + nsd_details = local.env.nsd_details + placement_strategy = local.env.placement_strategy + protocol_instances = local.env.protocol_instances + protocol_subnets_cidr = [local.env.protocol_subnets_cidr] + static_compute_instances = local.env.static_compute_instances + storage_instances = local.env.storage_instances + storage_subnets_cidr = [local.env.storage_subnets_cidr] + vpc_name = local.env.vpc_name + + # compute_gui_password = local.env.compute_gui_password + # compute_gui_username = local.env.compute_gui_username + # client_subnets_cidr = local.env.client_subnets_cidr + # hpcs_instance_name = local.env.hpcs_instance_name + # storage_gui_password = local.env.storage_gui_password + # storage_gui_username = local.env.storage_gui_username +} diff --git a/solutions/custom/outputs.tf b/solutions/custom/outputs.tf new file mode 100644 index 00000000..f178ca4b --- /dev/null +++ b/solutions/custom/outputs.tf @@ -0,0 +1,4 @@ +output "custom" { + description = "Custom details" + value = module.custom +} diff --git a/solutions/custom/override.json b/solutions/custom/override.json new file mode 100644 index 00000000..5a39e286 --- /dev/null +++ b/solutions/custom/override.json @@ -0,0 +1,108 @@ +{ + "cluster_prefix": "lsf", + "existing_resource_group": "Default", + "vpc_name": null, + "network_cidr": "10.0.0.0/8", + "placement_strategy": null, + "enable_bastion": true, + "enable_deployer": false, + "deployer_instance_profile": "mx2-4x32", + "vpc_cluster_login_private_subnets_cidr_blocks": [ + "10.0.0.0/24" + ], + "client_subnets_cidr": [ + "10.10.10.0/24" + ], + "client_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + 
"image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "vpc_cluster_private_subnets_cidr_blocks": [ + "10.10.20.0/24", + "10.20.20.0/24", + "10.30.20.0/24" + ], + "management_instances": [ + { + "profile": "cx2-2x4", + "count": 3, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "static_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 0, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "dynamic_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 5000, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "compute_gui_username": "admin", + "storage_subnets_cidr": [ + "10.10.30.0/24", + "10.20.30.0/24", + "10.30.30.0/24" + ], + "storage_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "protocol_subnets_cidr": [ + "10.10.40.0/24", + "10.20.40.0/24", + "10.30.40.0/24" + ], + "protocol_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "colocate_protocol_instances": false, + "storage_gui_username": "admin", + "nsd_details": [ + { + "capacity": 100, + "iops": 1000, + "profile": "custom" + } + ], + "custom_file_shares": [ + { + "mount_path": "/mnt/binaries", + "size": 100, + "iops": 1000 + }, + { + "mount_path": "/mnt/data", + "size": 100, + "iops": 1000 + } + ], + "dns_instance_id": null, + "dns_custom_resolver_id": null, + "dns_domain_names": { + "compute": "comp.com", + "storage": "strg.com", + "protocol": "ces.com" + }, + "enable_cos_integration": true, + "cos_instance_name": null, + "enable_atracker": true, + "enable_vpc_flow_logs": true, + "key_management": "key_protect", + "hpcs_instance_name": null +} diff --git a/solutions/custom/variables.tf b/solutions/custom/variables.tf new file mode 100644 index 00000000..a9bb616f --- /dev/null +++ b/solutions/custom/variables.tf @@ -0,0 +1,714 @@ +############################################################################## +# Offering Variations +############################################################################## +variable "scheduler" { + type = string + default = "LSF" + description = "Select one of the scheduler (LSF/Symphony/Slurm/null)" +} + +variable "ibm_customer_number" { + type = string + sensitive = true + description = "Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn)." + validation { + condition = can(regex("^[0-9A-Za-z]*([0-9A-Za-z]+,[0-9A-Za-z]+)*$", var.ibm_customer_number)) + error_message = "The IBM customer number input value cannot have special characters." + } +} + +############################################################################## +# Account Variables +############################################################################## +variable "ibmcloud_api_key" { + type = string + sensitive = true + description = "IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required." +} + +############################################################################## +# Cluster Level Variables +############################################################################## +variable "zones" { + description = "Specify the IBM Cloud zone within the chosen region where the IBM Spectrum LSF cluster will be deployed. 
A single zone input is required, and the management nodes, file storage shares, and compute nodes will all be provisioned in this zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli)." + type = list(string) + default = ["us-east-1"] + validation { + condition = length(var.zones) == 1 + error_message = "HPC product deployment supports only a single zone. Provide a value for a single zone from the supported regions: eu-de-2 or eu-de-3 for eu-de, us-east-1 or us-east-3 for us-east, and us-south-1 for us-south." + } +} + +variable "ssh_keys" { + type = list(string) + default = null + description = "The key pair to use to access the HPC cluster." +} + +variable "remote_allowed_ips" { + type = list(string) + description = "Comma-separated list of IP addresses that can access the IBM Spectrum LSF cluster instance through an SSH interface. For security purposes, provide the public IP addresses assigned to the devices that are authorized to establish SSH connections (for example, [\"169.45.117.34\"]). To fetch the IP address of the device, use [https://ipv4.icanhazip.com/](https://ipv4.icanhazip.com/)." + validation { + condition = alltrue([ + for o in var.remote_allowed_ips : !contains(["0.0.0.0/0", "0.0.0.0"], o) + ]) + error_message = "For security, provide the public IP addresses assigned to the devices authorized to establish SSH connections. Use https://ipv4.icanhazip.com/ to fetch the ip address of the device." + } + validation { + condition = alltrue([ + for a in var.remote_allowed_ips : can(regex("^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(/(3[0-2]|2[0-9]|1[0-9]|[0-9]))?$", a)) + ]) + error_message = "The provided IP address format is not valid. Check if the IP address contains a comma instead of a dot, and ensure there are double quotation marks between each IP address range if using multiple IP ranges. For multiple IP address, use the format [\"169.45.117.34\",\"128.122.144.145\"]." + } +} + +variable "cluster_prefix" { + type = string + default = "lsf" + description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." + validation { + error_message = "Prefix must begin and end with a letter and contain only letters, numbers, and - characters." + condition = can(regex("^([A-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.cluster_prefix)) + } + validation { + condition = length(var.cluster_prefix) <= 16 + error_message = "The cluster_prefix must be 16 characters or fewer." + } +} + +############################################################################## +# Resource Groups Variables +############################################################################## +variable "existing_resource_group" { + type = string + default = "Default" + description = "String describing resource groups to create or reference" + +} + +############################################################################## +# VPC Variables +############################################################################## +variable "vpc_name" { + type = string + default = null + description = "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. 
[Learn more](https://cloud.ibm.com/docs/vpc)" +} + +variable "vpc_cidr" { + type = string + default = "10.241.0.0/18" + description = "Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning." +} + +variable "placement_strategy" { + type = string + default = null + description = "VPC placement groups to create (null / host_spread / power_spread)" +} + +############################################################################## +# Access Variables +############################################################################## + +variable "deployer_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "hpc-lsf-fp15-deployer-rhel810-v1" + profile = "bx2-8x32" + } + description = "Configuration for the deployer node, including the custom image and instance profile. By default, uses fixpack_15 image and a bx2-8x32 profile." +} + +# variable "enable_bastion" { +# type = bool +# default = true +# description = "The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false." +# } + +variable "vpc_cluster_login_private_subnets_cidr_blocks" { + type = string + default = "10.241.16.0/28" + description = "Provide the CIDR block required for the creation of the login cluster's private subnet. Only one CIDR block is needed. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Since the login subnet is used only for the creation of login virtual server instances, provide a CIDR range of /28." + validation { + condition = tonumber(regex("^.*?/(\\d+)$", var.vpc_cluster_login_private_subnets_cidr_blocks)[0]) <= 28 + error_message = "This subnet is used to create only a login virtual server instance. Providing a larger CIDR size will waste the usage of available IPs. A CIDR range of /28 is sufficient for the creation of the login subnet." + } +} + +############################################################################## +# Compute Variables +############################################################################## +variable "client_subnets_cidr" { + type = string + default = "10.241.50.0/24" + description = "Subnet CIDR block to launch the client host." +} + +variable "client_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for client." +} + +variable "vpc_cluster_private_subnets_cidr_blocks" { + type = string + default = "10.241.0.0/20" + description = "Provide the CIDR block required for the creation of the compute cluster's private subnet. One CIDR block is required. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Ensure the selected CIDR block size can accommodate the maximum number of management and dynamic compute nodes expected in your cluster. For more information on CIDR block size selection, refer to the documentation, see [Choosing IP ranges for your VPC](https://cloud.ibm.com/docs/vpc?topic=vpc-choosing-ip-ranges-for-your-vpc)." 
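+  # Rough sizing illustration (an assumption for guidance, not a constraint enforced by this variable): the default /20 block (10.241.0.0/20) provides 4096 addresses, of which roughly 4091 remain usable after the five IPs that IBM Cloud VPC reserves in each subnet, which effectively caps the combined number of management and dynamic compute nodes. +  # If you expect more nodes than that, choose a larger CIDR block before deployment.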
+} + +variable "management_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for management." +} + +variable "static_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Minimum number of instances to be launched for the compute cluster." +} + +variable "dynamic_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 1024 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Maximum number of instances to be launched for the compute cluster." +} + +variable "compute_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on compute cluster." +} + +variable "compute_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for compute cluster GUI" +} + +############################################################################## +# Storage Scale Variables +############################################################################## +variable "storage_subnets_cidr" { + type = string + default = "10.241.30.0/24" + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "storage_instances" { + type = list( + object({ + profile = string + count = number + image = string + filesystem = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem = "/ibm/fs1" + }] + description = "Number of instances to be launched for storage cluster." +} + +variable "protocol_subnets_cidr" { + type = string + default = "10.241.40.0/24" + description = "Subnet CIDR block to launch the protocol hosts." +} + +variable "protocol_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for protocol hosts." +} + +# variable "colocate_protocol_instances" { +# type = bool +# default = true +# description = "Enable it to use storage instances as protocol instances" +# } + +variable "storage_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on storage cluster."
+} + +variable "storage_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for storage cluster GUI" +} + +variable "nsd_details" { + type = list( + object({ + profile = string + capacity = optional(number) + iops = optional(number) + }) + ) + default = [{ + capacity = 100 + iops = 1000 + profile = "custom" + }] + description = "Storage scale NSD details" +} + +variable "custom_file_shares" { + type = list( + object({ + mount_path = string, + size = number, + iops = number + }) + ) + default = [{ + mount_path = "/mnt/binaries" + size = 100 + iops = 1000 + }, { + mount_path = "/mnt/data" + size = 100 + iops = 1000 + }] + description = "Custom file shares to access shared storage" +} + +############################################################################## +# DNS Variables +############################################################################## + +variable "dns_instance_id" { + type = string + default = null + description = "IBM Cloud HPC DNS service instance id." +} + +variable "dns_custom_resolver_id" { + type = string + default = null + description = "IBM Cloud DNS custom resolver id." +} + +variable "dns_domain_names" { + type = object({ + compute = string + storage = string + protocol = string + client = string + gklm = string + }) + default = { + compute = "comp.com" + storage = "strg.com" + protocol = "ces.com" + client = "clnt.com" + gklm = "gklm.com" + } + description = "IBM Cloud HPC DNS domain names." +} + +############################################################################## +# Auth Variables +############################################################################## +# variable "enable_ldap" { +# type = bool +# default = false +# description = "Set this option to true to enable LDAP for IBM Cloud HPC, with the default value set to false." +# } + +# variable "ldap_basedns" { +# type = string +# default = "ldapscale.com" +# description = "The dns domain name is used for configuring the LDAP server. If an LDAP server is already in existence, ensure to provide the associated DNS domain name." +# } + +# variable "ldap_server" { +# type = string +# default = null +# description = "Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created." +# } + +# variable "ldap_admin_password" { +# type = string +# sensitive = true +# default = "hpc@IBMCloud" +# description = "The LDAP administrative password should be 8 to 20 characters long, with a mix of at least three alphabetic characters." +# } + +# variable "ldap_user_name" { +# type = string +# default = "admin" +# description = "Custom LDAP User for performing cluster operations. Note: Username should be between 4 to 32 characters." +# } + +# variable "ldap_user_password" { +# type = string +# sensitive = true +# default = "hpc@IBMCloud" +# description = "The LDAP user password should be 8 to 20 characters long, with a mix of at least three alphabetic character." +# } + +# variable "ldap_ssh_keys" { +# type = list(string) +# default = null +# description = "Name of the SSH key configured in your IBM Cloud account that is used to establish a connection to the LDAP Server." +# } + +# variable "ldap_instances" { +# type = list( +# object({ +# profile = string +# count = number +# image = string +# }) +# ) +# default = [{ +# profile = "bx2-2x8" +# count = 0 +# image = "ibm-redhat-8-10-minimal-amd64-2" +# }] +# description = "Number of instances to be launched for ldap hosts." 
+# } + +############################################################################## +# Encryption Variables +############################################################################## +variable "key_management" { + type = string + default = "key_protect" + description = "Set the value as key_protect to enable customer managed encryption for boot volume and file share. If key_management is set as null, IBM Cloud resources will always be encrypted through provider-managed encryption." + validation { + condition = var.key_management == "null" || var.key_management == null || var.key_management == "key_protect" + error_message = "key_management must be either 'null' or 'key_protect'." + } +} + +variable "hpcs_instance_name" { + type = string + default = null + description = "Hyper Protect Crypto Service instance" +} + +############################################################################## +# Observability Variables +############################################################################## +variable "enable_cos_integration" { + type = bool + default = true + description = "Integrate COS with HPC solution" +} + +variable "cos_instance_name" { + type = string + default = null + description = "Existing COS instance name" +} + +variable "enable_atracker" { + type = bool + default = true + description = "Enable Activity tracker" +} + +variable "enable_vpc_flow_logs" { + type = bool + default = true + description = "Enable VPC flow logs" +} + +############################################################################## +# Scale specific Variables +############################################################################## +# variable "filesystem_config" { +# type = list(object({ +# filesystem = string +# block_size = string +# default_data_replica = number +# default_metadata_replica = number +# max_data_replica = number +# max_metadata_replica = number +# mount_point = string +# })) +# default = null +# description = "File system configurations." +# } + +# variable "filesets_config" { +# type = list(object({ +# fileset = string +# filesystem = string +# junction_path = string +# client_mount_path = string +# quota = number +# })) +# default = null +# description = "Fileset configurations." +# } + +# variable "afm_instances" { +# type = list( +# object({ +# profile = string +# count = number +# image = string +# }) +# ) +# default = [{ +# profile = "bx2-2x8" +# count = 0 +# image = "ibm-redhat-8-10-minimal-amd64-2" +# }] +# description = "Number of instances to be launched for afm hosts." +# } + +# variable "afm_cos_config" { +# type = list(object({ +# afm_fileset = string, +# mode = string, +# cos_instance = string, +# bucket_name = string, +# bucket_region = string, +# cos_service_cred_key = string, +# bucket_type = string, +# bucket_storage_class = string +# })) +# default = null +# description = "AFM configurations." +# } + +############################################################################## +# LSF specific Variables +############################################################################## +# variable "cluster_name" { +# type = string +# default = "HPCCluster" +# description = "Unique ID of the cluster used by LSF for configuration of resources. This can be up to 39 alphanumeric characters."
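+# # Illustrative example (an assumption, not part of the original template): a value such as "HPC-Cluster_01.dev" satisfies the documented constraint of up to 39 characters drawn from letters, digits, underscore (_), hyphen (-), and period (.).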
+# validation { +# condition = 0 < length(var.cluster_name) && length(var.cluster_name) < 40 && can(regex("^[a-zA-Z0-9_.-]+$", var.cluster_name)) +# error_message = "The ID can be up to 39 alphanumeric characters including the underscore (_), the hyphen (-), and the period (.) characters." +# } +# } + +# variable "enable_hyperthreading" { +# type = bool +# default = true +# description = "Setting this to true will enable hyper-threading in the worker nodes of the cluster (default). Otherwise, hyper-threading will be disabled." +# } + +# variable "enable_dedicated_host" { +# type = bool +# default = false +# description = "Set to true to use dedicated hosts for compute hosts (default: false)." +# } + +# variable "dedicated_host_placement" { +# type = string +# default = "spread" +# description = "Specify 'pack' or 'spread'. The 'pack' option will deploy VSIs on one dedicated host until full before moving on to the next dedicated host." +# validation { +# condition = var.dedicated_host_placement == "spread" || var.dedicated_host_placement == "pack" +# error_message = "Supported values for dedicated_host_placement: spread or pack." +# } +# } + +# variable "enable_app_center" { +# type = bool +# default = false +# description = "Set to true to install and enable use of the IBM Spectrum LSF Application Center GUI." +# } + +# variable "app_center_gui_password" { +# type = string +# default = "hpc@IBMCloud" +# sensitive = true +# description = "Password for IBM Spectrum LSF Application Center GUI." +# } + +# variable "app_center_db_password" { +# type = string +# default = "hpc@IBMCloud" +# sensitive = true +# description = "Password for IBM Spectrum LSF Application Center database GUI." +# } + +############################################################################## +# Symphony specific Variables +############################################################################## + +############################################################################## +# Slurm specific Variables +############################################################################## + +############################################################################## +# Landing Zone Variables +############################################################################## +# variable "clusters" { +# default = null +# description = "A list describing clusters workloads to create" +# type = list( +# object({ +# name = string # Name of Cluster +# vpc_name = string # Name of VPC +# subnet_names = list(string) # List of vpc subnets for cluster +# workers_per_subnet = number # Worker nodes per subnet. 
+# machine_type = string # Worker node flavor +# kube_type = string # iks or openshift +# kube_version = optional(string) # Can be a version from `ibmcloud ks versions` or `default` +# entitlement = optional(string) # entitlement option for openshift +# secondary_storage = optional(string) # Secondary storage type +# pod_subnet = optional(string) # Portable subnet for pods +# service_subnet = optional(string) # Portable subnet for services +# existing_resource_group = string # Resource Group used for cluster +# cos_name = optional(string) # Name of COS instance Required only for OpenShift clusters +# access_tags = optional(list(string), []) +# boot_volume_crk_name = optional(string) # Boot volume encryption key name +# disable_public_endpoint = optional(bool, true) # disable cluster public, leaving only private endpoint +# disable_outbound_traffic_protection = optional(bool, false) # public outbound access from the cluster workers +# cluster_force_delete_storage = optional(bool, false) # force the removal of persistent storage associated with the cluster during cluster deletion +# operating_system = string # The operating system of the workers in the default worker pool. See https://cloud.ibm.com/docs/openshift?topic=openshift-openshift_versions#openshift_versions_available . +# kms_wait_for_apply = optional(bool, true) # make terraform wait until KMS is applied to master and it is ready and deployed +# verify_cluster_network_readiness = optional(bool, true) # Flag to run a script will run kubectl commands to verify that all worker nodes can communicate successfully with the master. If the runtime does not have access to the kube cluster to run kubectl commands, this should be set to false. +# use_ibm_cloud_private_api_endpoints = optional(bool, true) # Flag to force all cluster related api calls to use the IBM Cloud private endpoints. +# import_default_worker_pool_on_create = optional(bool) # (Advanced users) Whether to handle the default worker pool as a stand-alone ibm_container_vpc_worker_pool resource on cluster creation. Only set to false if you understand the implications of managing the default worker pool as part of the cluster resource. Set to true to import the default worker pool as a separate resource. Set to false to manage the default worker pool as part of the cluster resource. +# allow_default_worker_pool_replacement = optional(bool) # (Advanced users) Set to true to allow the module to recreate a default worker pool. Only use in the case where you are getting an error indicating that the default worker pool cannot be replaced on apply. Once the default worker pool is handled as a stand-alone ibm_container_vpc_worker_pool, if you wish to make any change to the default worker pool which requires the re-creation of the default pool set this variable to true +# labels = optional(map(string)) # A list of labels that you want to add to the default worker pool. +# addons = optional(object({ # Map of OCP cluster add-on versions to install +# debug-tool = optional(string) +# image-key-synchronizer = optional(string) +# openshift-data-foundation = optional(string) +# vpc-file-csi-driver = optional(string) +# static-route = optional(string) +# cluster-autoscaler = optional(string) +# vpc-block-csi-driver = optional(string) +# ibm-storage-operator = optional(string) +# }), {}) +# manage_all_addons = optional(bool, false) # Instructs Terraform to manage all cluster addons, even if addons were installed outside of the module. 
If set to 'true' this module will destroy any addons that were installed by other sources. +# kms_config = optional( +# object({ +# crk_name = string # Name of key +# private_endpoint = optional(bool) # Private endpoint +# }) +# ) +# worker_pools = optional( +# list( +# object({ +# name = string # Worker pool name +# vpc_name = string # VPC name +# workers_per_subnet = number # Worker nodes per subnet +# flavor = string # Worker node flavor +# subnet_names = list(string) # List of vpc subnets for worker pool +# entitlement = optional(string) # entitlement option for openshift +# secondary_storage = optional(string) # Secondary storage type +# boot_volume_crk_name = optional(string) # Boot volume encryption key name +# operating_system = string # The operating system of the workers in the worker pool. See https://cloud.ibm.com/docs/openshift?topic=openshift-openshift_versions#openshift_versions_available . +# labels = optional(map(string)) # A list of labels that you want to add to all the worker nodes in the worker pool. +# }) +# ) +# ) +# }) +# ) +# } + +############################################################################## +# Terraform generic Variables +############################################################################## +# tflint-ignore: all +# variable "TF_PARALLELISM" { +# type = string +# default = "250" +# description = "Limit the number of concurrent operation." +# } + +# tflint-ignore: all +# variable "TF_VERSION" { +# type = string +# default = "1.9" +# description = "The version of the Terraform engine that's used in the Schematics workspace." +# } + +# tflint-ignore: all +# variable "TF_LOG" { +# type = string +# default = "ERROR" +# description = "The Terraform log level used for output in the Schematics workspace." +# } + +############################################################################## +# Override JSON +############################################################################## +variable "override" { + type = bool + default = false + description = "Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment." + +} + +variable "override_json_string" { + type = string + default = null + description = "Override default values with a JSON object. Any JSON other than an empty string overrides other configuration changes." +} diff --git a/solutions/custom/version.tf b/solutions/custom/version.tf new file mode 100644 index 00000000..93f82bed --- /dev/null +++ b/solutions/custom/version.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.9.0" + required_providers { + ibm = { + source = "IBM-Cloud/ibm" + version = ">= 1.68.1, < 2.0.0" + } + } +} + +provider "ibm" { + ibmcloud_api_key = var.ibmcloud_api_key + region = local.region +} diff --git a/solutions/hpc/README.md b/solutions/hpc/README.md deleted file mode 100644 index c91d57d8..00000000 --- a/solutions/hpc/README.md +++ /dev/null @@ -1,26 +0,0 @@ -IBM Cloud HPC is a deployable architecture where you can deploy both HPC scheduling software and compute clusters to run and manage your compute-intensive HPC workloads. You can reserve capacity on a recurring hourly basis from a dedicated IBM Cloud resource pool where the clusters of VPC virtual server instances can be provisioned. - -## Before you begin -### Configure your IBM Cloud settings -Set up and configure an IBM Cloud account. 
For more information, click [here](https://cloud.ibm.com/docs/allowlist/hpc-service?topic=hpc-service-before-you-begin-deploying#confirm-cloud-settings)
-### Generate an API key
-Generate an API key for your IBM Cloud account where the IBM Cloud HPC cluster will be deployed. For more information, click [here](https://cloud.ibm.com/docs/allowlist/hpc-service?topic=hpc-service-before-you-begin-deploying#set-IAM-permissions).
-### Create an SSH key
-Create an SSH key in your IBM Cloud account. This is your SSH key that you will use to access the IBM Cloud HPC cluster. For more information, click [here](https://cloud.ibm.com/docs/allowlist/hpc-service?topic=hpc-service-before-you-begin-deploying#create-ssh-key).
-
-## Required resources
-### Understand the cluster configuration
-Familiarize yourself with the IBM Cloud HPC deployable architecture:
https://cloud.ibm.com/docs/allowlist/hpc-service?topic=hpc-service-ibm-cloud-hpc
-
-## Deploying the environment
-Deploy the IBM Cloud HPC cluster:
https://cloud.ibm.com/docs/allowlist/hpc-service?topic=hpc-service-deploy-architecture
-
-## Cleaning up deployed environments
-If you no longer need your deployed IBM Cloud HPC cluster, you can clean it up from your environment:
https://cloud.ibm.com/docs/allowlist/hpc-service?topic=hpc-service-cleaning-up-deployed-environments - -## Getting support - -This offering is provided and supported by [IBM Corporation](https://www.ibm.com/mysupport). To get help with the IBM Cloud HPC offering please contact IBM HPC Cloud support using: -* URL https://www.ibm.com/mysupport -* Call IBM support - country-based numbers listed [here](https://www.ibm.com/planetwide). -IBM Cloud HPC support is provided 24x7x365 for Severity 1 issues and during client business hours (8 AM to 5 PM) for Severity 2, 3 and 4 issues. diff --git a/solutions/hpc/catalogValidationValues.json.template b/solutions/hpc/catalogValidationValues.json.template deleted file mode 100644 index 9f1867b8..00000000 --- a/solutions/hpc/catalogValidationValues.json.template +++ /dev/null @@ -1,9 +0,0 @@ -{ - "ibmcloud_api_key": $VALIDATION_APIKEY, - "existing_resource_group": $RG_NAME, - "cluster_name": "HPC-LSF-1", - "bastion_ssh_keys": "[\"geretain-hpc\"]", - "compute_ssh_keys": "[\"geretain-hpc\"]", - "remote_allowed_ips": "[\"129.41.58.5\"]", - "zones": "[\"us-east-1\"]" -} diff --git a/solutions/hpc/datasource.tf b/solutions/hpc/datasource.tf deleted file mode 100644 index 6dd317f8..00000000 --- a/solutions/hpc/datasource.tf +++ /dev/null @@ -1,96 +0,0 @@ -data "ibm_is_region" "region" { - name = local.region -} - -data "ibm_is_vpc" "itself" { - count = var.vpc_name == null ? 0 : 1 - name = var.vpc_name -} - -locals { - vpc_name = var.vpc_name == null ? one(module.landing_zone[0].vpc_name) : var.vpc_name - # region_name = [for zone in var.zones : join("-", slice(split("-", zone), 0, 2))][0] - api_endpoint_region_map = { - "us-east" = "https://api.us-east.codeengine.cloud.ibm.com/v2beta" - "eu-de" = "https://api.eu-de.codeengine.cloud.ibm.com/v2beta" - "us-south" = "https://api.us-south.codeengine.cloud.ibm.com/v2beta" - } - ldap_server_status = var.enable_ldap == true && var.ldap_server == "null" ? false : true - - # Decode the JSON reply got from the Code Engine API - # https://hpc-api..codeengine.cloud.ibm.com/v3/capacity_reservations - # Verify if in the capacity_reservations list there is one with the name equal to the Contract ID. - reservation_id_found = try(length([for res in local.reservation_data.capacity_reservations : res if res.name == var.reservation_id]), 0) > 0 - # Verify if the status code is 200 - reservation_data = var.solution == "hpc" ? jsondecode(data.http.reservation_id_validation[0].response_body) : null - valid_status_code = var.solution == "hpc" ? contains(["200"], tostring(data.http.reservation_id_validation[0].status_code)) : false - -} - -data "ibm_is_vpc" "existing_vpc" { - # Lookup for this VPC resource only if var.vpc_name is not empty - count = var.vpc_name != null ? 1 : 0 - name = var.vpc_name -} - -data "ibm_is_vpc" "vpc" { - name = local.vpc_name - # Depends on creation of new VPC or look up of existing VPC based on value of var.vpc_name, - depends_on = [module.landing_zone.vpc_name, data.ibm_is_vpc.existing_vpc] -} - -data "ibm_is_subnet" "existing_subnet" { - # Lookup for this Subnet resources only if var.cluster_subnet_ids is not empty - count = (length(var.cluster_subnet_ids) == 1 && var.vpc_name != null) ? length(var.cluster_subnet_ids) : 0 - identifier = var.cluster_subnet_ids[count.index] -} - -data "ibm_is_subnet" "existing_login_subnet" { - # Lookup for this Subnet resources only if var.login_subnet_id is not empty - count = (var.login_subnet_id != null && var.vpc_name != null) ? 
1 : 0 - identifier = var.login_subnet_id -} - -# Validating Contract ID -data "ibm_iam_auth_token" "auth_token" {} - -data "http" "reservation_id_validation" { - count = var.solution == "hpc" ? 1 : 0 - url = "${local.api_endpoint_region_map[local.region]}/capacity_reservations" - method = "GET" - request_headers = { - Accept = "application/json" - Authorization = data.ibm_iam_auth_token.auth_token.iam_access_token - # Content-Type = "application/json" - } -} - -# Code for Public Gateway attachment for the existing vpc and new subnets scenario - -data "ibm_is_public_gateways" "public_gateways" { -} - -locals { - public_gateways_list = data.ibm_is_public_gateways.public_gateways.public_gateways - zone_1_pgw_ids = var.vpc_name != null ? [for gateway in local.public_gateways_list : gateway.id if gateway.vpc == local.vpc_id && gateway.zone == var.zones[0]] : [] -} - -resource "ibm_is_subnet_public_gateway_attachment" "zone_1_attachment" { - count = (var.vpc_name != null && length(var.cluster_subnet_ids) == 0) ? 1 : 0 - subnet = local.compute_subnets[0].id - public_gateway = length(local.zone_1_pgw_ids) > 0 ? local.zone_1_pgw_ids[0] : "" -} - -data "ibm_is_dedicated_host_profiles" "worker" { - count = var.enable_dedicated_host ? 1 : 0 -} - -data "ibm_is_ssh_key" "bastion" { - for_each = toset(var.bastion_ssh_keys) - name = each.key -} - -data "ibm_is_ssh_key" "compute" { - for_each = toset(var.compute_ssh_keys) - name = each.key -} diff --git a/solutions/hpc/input_validation.tf b/solutions/hpc/input_validation.tf deleted file mode 100644 index 94027282..00000000 --- a/solutions/hpc/input_validation.tf +++ /dev/null @@ -1,232 +0,0 @@ -################################################### -# Copyright (C) IBM Corp. 2023 All Rights Reserved. -# Licensed under the Apache License v2.0 -################################################### - -# This file contains the complete information on all the validations performed from the code during the generate plan process -# Validations are performed to make sure, the appropriate error messages are displayed to user in-order to provide required input parameter - -locals { - # validation for the boot volume encryption toggling. - validate_enable_customer_managed_encryption = anytrue([alltrue([var.kms_key_name != null, var.kms_instance_name != null]), (var.kms_key_name == null), (var.key_management != "key_protect")]) - validate_enable_customer_managed_encryption_msg = "Please make sure you are passing the kms_instance_name if you are passing kms_key_name." - # tflint-ignore: terraform_unused_declarations - validate_enable_customer_managed_encryption_chk = regex( - "^${local.validate_enable_customer_managed_encryption_msg}$", - (local.validate_enable_customer_managed_encryption ? local.validate_enable_customer_managed_encryption_msg : "")) - - # validation for the boot volume encryption toggling. - validate_null_customer_managed_encryption = anytrue([alltrue([var.kms_instance_name == null, var.key_management != "key_protect"]), (var.key_management == "key_protect")]) - validate_null_customer_managed_encryption_msg = "Please make sure you are setting key_management as key_protect if you are passing kms_instance_name, kms_key_name." - # tflint-ignore: terraform_unused_declarations - validate_null_customer_managed_encryption_chk = regex( - "^${local.validate_null_customer_managed_encryption_msg}$", - (local.validate_null_customer_managed_encryption ? 
local.validate_null_customer_managed_encryption_msg : "")) - - # validate application center gui password - password_msg = "Password should be at least 8 characters, must have one number, one lowercase letter, and one uppercase letter, at least one unique character. Password Should not contain username" - validate_app_center_gui_pwd = (var.enable_app_center && can(regex("^.{8,}$", var.app_center_gui_pwd) != "") && can(regex("[0-9]{1,}", var.app_center_gui_pwd) != "") && can(regex("[a-z]{1,}", var.app_center_gui_pwd) != "") && can(regex("[A-Z]{1,}", var.app_center_gui_pwd) != "") && can(regex("[!@#$%^&*()_+=-]{1,}", var.app_center_gui_pwd) != "") && trimspace(var.app_center_gui_pwd) != "") || !var.enable_app_center - # tflint-ignore: terraform_unused_declarations - validate_app_center_gui_pwd_chk = regex( - "^${local.password_msg}$", - (local.validate_app_center_gui_pwd ? local.password_msg : "")) - - # Validate existing cluster subnet should be the subset of vpc_name entered - validate_subnet_id_vpc_msg = "Provided cluster subnets should be within the vpc entered." - validate_subnet_id_vpc = anytrue([length(var.cluster_subnet_ids) == 0, length(var.cluster_subnet_ids) == 1 && var.vpc_name != null ? alltrue([for subnet_id in var.cluster_subnet_ids : contains(data.ibm_is_vpc.existing_vpc[0].subnets[*].id, subnet_id)]) : false]) - # tflint-ignore: terraform_unused_declarations - validate_subnet_id_vpc_chk = regex("^${local.validate_subnet_id_vpc_msg}$", - (local.validate_subnet_id_vpc ? local.validate_subnet_id_vpc_msg : "")) - - # Validate existing cluster subnet should be in the appropriate zone. - validate_subnet_id_zone_msg = "Provided cluster subnets should be in appropriate zone." - validate_subnet_id_zone = anytrue([length(var.cluster_subnet_ids) == 0, length(var.cluster_subnet_ids) == 1 && var.vpc_name != null ? alltrue([data.ibm_is_subnet.existing_subnet[0].zone == var.zones[0]]) : false]) - # tflint-ignore: terraform_unused_declarations - validate_subnet_id_zone_chk = regex("^${local.validate_subnet_id_zone_msg}$", - (local.validate_subnet_id_zone ? local.validate_subnet_id_zone_msg : "")) - - # Validate existing login subnet should be the subset of vpc_name entered - validate_login_subnet_id_vpc_msg = "Provided login subnet should be within the vpc entered." - validate_login_subnet_id_vpc = anytrue([var.login_subnet_id == null, var.login_subnet_id != null && var.vpc_name != null ? alltrue([for subnet_id in [var.login_subnet_id] : contains(data.ibm_is_vpc.existing_vpc[0].subnets[*].id, subnet_id)]) : false]) - # tflint-ignore: terraform_unused_declarations - validate_login_subnet_id_vpc_chk = regex("^${local.validate_login_subnet_id_vpc_msg}$", - (local.validate_login_subnet_id_vpc ? local.validate_login_subnet_id_vpc_msg : "")) - - # Validate existing login subnet should be in the appropriate zone. - validate_login_subnet_id_zone_msg = "Provided login subnet should be in appropriate zone." - validate_login_subnet_id_zone = anytrue([var.login_subnet_id == null, var.login_subnet_id != null && var.vpc_name != null ? alltrue([data.ibm_is_subnet.existing_login_subnet[0].zone == var.zones[0]]) : false]) - # tflint-ignore: terraform_unused_declarations - validate_login_subnet_id_zone_chk = regex("^${local.validate_login_subnet_id_zone_msg}$", - (local.validate_login_subnet_id_zone ? local.validate_login_subnet_id_zone_msg : "")) - - # Contract ID validation - # validate_reservation_id = length("${var.cluster_id}${var.reservation_id}") > 129 ? 
false : true - # validate_reservation_id_msg = "The length of reservation_id and cluster_id combination should not exceed 128 characters." - # # tflint-ignore: terraform_unused_declarations - # validate_reservation_id_chk = regex( - # "^${local.validate_reservation_id_msg}$", - # (local.validate_reservation_id ? local.validate_reservation_id_msg : "")) - - validate_reservation_id_api = var.solution == "hpc" ? local.valid_status_code && local.reservation_id_found : true - validate_reservation_id_api_msg = "The provided reservation id doesn't have a valid reservation or the reservation id is not on the same account as HPC deployment." - # tflint-ignore: terraform_unused_declarations - validate_reservation_id_api_chk = regex( - "^${local.validate_reservation_id_api_msg}$", - (local.validate_reservation_id_api ? local.validate_reservation_id_api_msg : "")) - - validate_worker_count = var.solution == "lsf" ? local.total_worker_node_count <= var.worker_node_max_count : true - validate_worker_error_msg = "If the solution is set as lsf, the worker min count cannot be greater than worker max count." - # tflint-ignore: terraform_unused_declarations - validate_worker_count_chk = regex( - "^${local.validate_worker_error_msg}$", - (local.validate_worker_count ? local.validate_worker_error_msg : "")) - - # Validate custom fileshare - # Construct a list of Share size(GB) and IOPS range(IOPS)from values provided in https://cloud.ibm.com/docs/vpc?topic=vpc-file-storage-profiles&interface=ui#dp2-profile - # List values [[sharesize_start,sharesize_end,min_iops,max_iops], [..]....] - custom_fileshare_iops_range = [[10, 39, 100, 1000], [40, 79, 100, 2000], [80, 99, 100, 4000], [100, 499, 100, 6000], [500, 999, 100, 10000], [1000, 1999, 100, 20000], [2000, 3999, 200, 40000], [4000, 7999, 300, 40000], [8000, 15999, 500, 64000], [16000, 32000, 2000, 96000]] - # List with input iops value, min and max iops for the input share size. - size_iops_lst = [for values in var.custom_file_shares : [for list_val in local.custom_fileshare_iops_range : [values.size != null ? (values.iops != null ? (values.size >= list_val[0] && values.size <= list_val[1] ? values.iops : null) : null) : null, list_val[2], list_val[3]] if values.size != null]] - validate_custom_file_share = alltrue([for iops in local.size_iops_lst : (length(iops) > 0 ? (iops[0][0] != null ? (iops[0][0] >= iops[0][1] && iops[0][0] <= iops[0][2]) : true) : true)]) - # Validate the input iops falls inside the range. - # validate_custom_file_share = alltrue([for iops in local.size_iops_lst : iops[0][0] >= iops[0][1] && iops[0][0] <= iops[0][2]]) - validate_custom_file_share_msg = "Provided iops value is not valid for given file share size. Please refer 'File Storage for VPC profiles' page in ibm cloud docs for a valid iops and file share size combination." - # tflint-ignore: terraform_unused_declarations - validate_custom_file_share_chk = regex( - "^${local.validate_custom_file_share_msg}$", - (local.validate_custom_file_share ? local.validate_custom_file_share_msg : "")) - - # LDAP base DNS Validation - validate_ldap_basedns = (var.enable_ldap && trimspace(var.ldap_basedns) != "") || !var.enable_ldap - ldap_basedns_msg = "If LDAP is enabled, then the base DNS should not be empty or null. Need a valid domain name." - # tflint-ignore: terraform_unused_declarations - validate_ldap_basedns_chk = regex( - "^${local.ldap_basedns_msg}$", - (local.validate_ldap_basedns ? 
local.ldap_basedns_msg : "")) - - # LDAP base existing LDAP server - validate_ldap_server = (var.enable_ldap && trimspace(var.ldap_server) != "") || !var.enable_ldap - ldap_server_msg = "IP of existing LDAP server. If none given a new ldap server will be created. It should not be empty." - # tflint-ignore: terraform_unused_declarations - validate_ldap_server_chk = regex( - "^${local.ldap_server_msg}$", - (local.validate_ldap_server ? local.ldap_server_msg : "")) - - # Existing LDAP server cert validation - validate_ldap_server_cert = ( - (trimspace(var.ldap_server) != "" && trimspace(var.ldap_server_cert) != "" && trimspace(var.ldap_server_cert) != "null") || - trimspace(var.ldap_server) == "null" || - !var.enable_ldap - ) - ldap_server_cert_msg = "Provide the current LDAP server certificate. This is required if 'ldap_server' is not set to 'null'; otherwise, the LDAP configuration will not succeed." - # tflint-ignore: terraform_unused_declarations - validate_ldap_server_cert_chk = regex( - "^${local.ldap_server_cert_msg}$", - local.validate_ldap_server_cert ? local.ldap_server_cert_msg : "" - ) - - # LDAP Admin Password Validation - validate_ldap_adm_pwd = var.enable_ldap && var.ldap_server == "null" ? (length(var.ldap_admin_password) >= 8 && length(var.ldap_admin_password) <= 20 && can(regex("^(.*[0-9]){2}.*$", var.ldap_admin_password))) && can(regex("^(.*[A-Z]){1}.*$", var.ldap_admin_password)) && can(regex("^(.*[a-z]){1}.*$", var.ldap_admin_password)) && can(regex("^.*[~@_+:].*$", var.ldap_admin_password)) && can(regex("^[^!#$%^&*()=}{\\[\\]|\\\"';?.<,>-]+$", var.ldap_admin_password)) : local.ldap_server_status - ldap_adm_password_msg = "Password that is used for LDAP admin. The password must contain at least 8 characters and at most 20 characters. For a strong password, at least three alphabetic characters are required, with at least one uppercase and one lowercase letter. Two numbers, and at least one special character. Make sure that the password doesn't include the username." - # tflint-ignore: terraform_unused_declarations - validate_ldap_adm_pwd_chk = regex( - "^${local.ldap_adm_password_msg}$", - (local.validate_ldap_adm_pwd ? local.ldap_adm_password_msg : "")) - - # LDAP User Validation - validate_ldap_usr = var.enable_ldap && var.ldap_server == "null" ? (length(var.ldap_user_name) >= 4 && length(var.ldap_user_name) <= 32 && var.ldap_user_name != "" && can(regex("^[a-zA-Z0-9_-]*$", var.ldap_user_name)) && trimspace(var.ldap_user_name) != "") : local.ldap_server_status - ldap_usr_msg = "The input for 'ldap_user_name' is considered invalid. The username must be within the range of 4 to 32 characters and may only include letters, numbers, hyphens, and underscores. Spaces are not permitted." - # tflint-ignore: terraform_unused_declarations - validate_ldap_usr_chk = regex( - "^${local.ldap_usr_msg}$", - (local.validate_ldap_usr ? local.ldap_usr_msg : "")) - - # LDAP User Password Validation - validate_ldap_usr_pwd = var.enable_ldap && var.ldap_server == "null" ? (length(var.ldap_user_password) >= 8 && length(var.ldap_user_password) <= 20 && can(regex("^(.*[0-9]){2}.*$", var.ldap_user_password))) && can(regex("^(.*[A-Z]){1}.*$", var.ldap_user_password)) && can(regex("^(.*[a-z]){1}.*$", var.ldap_user_password)) && can(regex("^.*[~@_+:].*$", var.ldap_user_password)) && can(regex("^[^!#$%^&*()=}{\\[\\]|\\\"';?.<,>-]+$", var.ldap_user_password)) : local.ldap_server_status - ldap_usr_password_msg = "Password that is used for LDAP user. 
The password must contain at least 8 characters and at most 20 characters. For a strong password, at least three alphabetic characters are required, with at least one uppercase and one lowercase letter. Two numbers, and at least one special character. Make sure that the password doesn't include the username." - # tflint-ignore: terraform_unused_declarations - validate_ldap_usr_pwd_chk = regex( - "^${local.ldap_usr_password_msg}$", - (local.validate_ldap_usr_pwd ? local.ldap_usr_password_msg : "")) - - # Validate existing subnet public gateways - validate_subnet_name_pg_msg = "Provided existing cluster_subnet_ids should have public gateway attached." - validate_subnet_name_pg = anytrue([length(var.cluster_subnet_ids) == 0, length(var.cluster_subnet_ids) == 1 && var.vpc_name != null ? (data.ibm_is_subnet.existing_subnet[0].public_gateway != "") : false]) - # tflint-ignore: terraform_unused_declarations - validate_subnet_name_pg_chk = regex("^${local.validate_subnet_name_pg_msg}$", - (local.validate_subnet_name_pg ? local.validate_subnet_name_pg_msg : "")) - - # Validate existing vpc public gateways - validate_existing_vpc_pgw_msg = "Provided existing vpc should have the public gateways created in the provided zones." - validate_existing_vpc_pgw = anytrue([(var.vpc_name == null), alltrue([var.vpc_name != null, length(var.cluster_subnet_ids) == 1]), alltrue([var.vpc_name != null, length(var.cluster_subnet_ids) == 0, var.login_subnet_id == null, length(local.zone_1_pgw_ids) > 0])]) - # tflint-ignore: terraform_unused_declarations - validate_existing_vpc_pgw_chk = regex("^${local.validate_existing_vpc_pgw_msg}$", - (local.validate_existing_vpc_pgw ? local.validate_existing_vpc_pgw_msg : "")) - - # Validate in case of existing subnets provide both login_subnet_id and cluster_subnet_ids. - validate_login_subnet_id_msg = "In case of existing subnets provide both login_subnet_id and cluster_subnet_ids." - validate_login_subnet_id = anytrue([alltrue([length(var.cluster_subnet_ids) == 0, var.login_subnet_id == null]), alltrue([length(var.cluster_subnet_ids) != 0, var.login_subnet_id != null])]) - # tflint-ignore: terraform_unused_declarations - validate_login_subnet_id_chk = regex("^${local.validate_login_subnet_id_msg}$", - (local.validate_login_subnet_id ? local.validate_login_subnet_id_msg : "")) - - # Validate the subnet_id user input value - validate_subnet_id_msg = "If the cluster_subnet_ids are provided, the user should also provide the vpc_name." - validate_subnet_id = anytrue([var.vpc_name != null && length(var.cluster_subnet_ids) > 0, length(var.cluster_subnet_ids) == 0]) - # tflint-ignore: terraform_unused_declarations - validate_subnet_id_chk = regex("^${local.validate_subnet_id_msg}$", - (local.validate_subnet_id ? local.validate_subnet_id_msg : "")) - - # Management node count validation when Application Center is in High Availability - validate_management_node_count = (var.enable_app_center && var.app_center_high_availability && var.management_node_count >= 2) || !var.app_center_high_availability || !var.enable_app_center - management_node_count_msg = "When the Application Center is installed in High Availability, at least two management nodes must be installed." - # tflint-ignore: terraform_unused_declarations - validate_management_node_count_chk = regex( - "^${local.management_node_count_msg}$", - (local.validate_management_node_count ? 
local.management_node_count_msg : "")) - - # IBM Cloud Application load Balancer CRN validation - validate_alb_crn = (var.enable_app_center && var.app_center_high_availability) && can(regex("^crn:v1:bluemix:public:secrets-manager:[a-zA-Z\\-]+:[a-zA-Z0-9\\-]+\\/[a-zA-Z0-9\\-]+:[a-fA-F0-9\\-]+:secret:[a-fA-F0-9\\-]+$", var.app_center_existing_certificate_instance)) || !var.app_center_high_availability || !var.enable_app_center - alb_crn_template_msg = "When app_center_high_availability is enable/set as true, The Application Center will be configured for high availability and requires a Application Load Balancer Front End listener to use a certificate CRN value stored in the Secret Manager. Provide the valid 'existing_certificate_instance' to configure the Application load balancer." - # tflint-ignore: terraform_unused_declarations - validate_alb_crn_chk = regex( - "^${local.alb_crn_template_msg}$", - (local.validate_alb_crn ? local.alb_crn_template_msg : "")) - - # Validate the dns_custom_resolver_id should not be given in case of new vpc case - validate_custom_resolver_id_msg = "If it is the new vpc deployment, do not provide existing dns_custom_resolver_id as that will impact the name resolution of the cluster." - validate_custom_resolver_id = anytrue([var.vpc_name != null, var.vpc_name == null && var.dns_custom_resolver_id == null]) - # tflint-ignore: terraform_unused_declarations - validate_custom_resolver_id_chk = regex("^${local.validate_custom_resolver_id_msg}$", - (local.validate_custom_resolver_id ? local.validate_custom_resolver_id_msg : "")) - - validate_reservation_id_new_msg = "Provided reservation id cannot be set as empty if the provided solution is set as hpc.." - validate_reservation_id_logic = var.solution == "hpc" ? var.reservation_id != null : true - # tflint-ignore: terraform_unused_declarations - validate_reservation_id_chk_new = regex("^${local.validate_reservation_id_new_msg}$", - (local.validate_reservation_id_logic ? local.validate_reservation_id_new_msg : "")) - - # IBM Cloud Monitoring validation - validate_observability_monitoring_enable_compute_nodes = (var.observability_monitoring_enable && var.observability_monitoring_on_compute_nodes_enable) || (var.observability_monitoring_enable && var.observability_monitoring_on_compute_nodes_enable == false) || (var.observability_monitoring_enable == false && var.observability_monitoring_on_compute_nodes_enable == false) - observability_monitoring_enable_compute_nodes_msg = "Please enable also IBM Cloud Monitoring to ingest metrics from Compute nodes" - # tflint-ignore: terraform_unused_declarations - observability_monitoring_enable_compute_nodes_chk = regex( - "^${local.observability_monitoring_enable_compute_nodes_msg}$", - (local.validate_observability_monitoring_enable_compute_nodes ? local.observability_monitoring_enable_compute_nodes_msg : "")) - - # Existing Bastion validation - validate_existing_bastion = var.existing_bastion_instance_name != null ? (var.existing_bastion_instance_public_ip != null && var.existing_bastion_security_group_id != null && var.existing_bastion_ssh_private_key != null) : local.bastion_instance_status - validate_existing_bastion_msg = "If bastion_instance_name is not null, then bastion_instance_public_ip, bastion_security_group_id, and bastion_ssh_private_key should not be null." - # tflint-ignore: terraform_unused_declarations - validate_existing_bastion_chk = regex( - "^${local.validate_existing_bastion_msg}$", - (local.validate_existing_bastion ? 
local.validate_existing_bastion_msg : "")) - - # Existing Storage security group validation - validate_existing_storage_sg = length([for share in var.custom_file_shares : { mount_path = share.mount_path, nfs_share = share.nfs_share } if share.nfs_share != null && share.nfs_share != ""]) > 0 ? var.storage_security_group_id != null ? true : false : true - validate_existing_storage_sg_msg = "Storage security group ID cannot be null when NFS share mount path is provided under cluster_file_shares variable." - # tflint-ignore: terraform_unused_declarations - validate_existing_storage_sg_chk = regex( - "^${local.validate_existing_storage_sg_msg}$", - (local.validate_existing_storage_sg ? local.validate_existing_storage_sg_msg : "")) -} diff --git a/solutions/hpc/locals.tf b/solutions/hpc/locals.tf deleted file mode 100644 index 9695e181..00000000 --- a/solutions/hpc/locals.tf +++ /dev/null @@ -1,306 +0,0 @@ -########################################################################### -locals { - # (overridable) switch to enable extra outputs (debugging) - print_extra_outputs = false - - # (overridable) switch to add the current (plan execution) IP to allowed CIDR list - add_current_ip_to_allowed_cidr = false - - # (overridable) list of extra entries for allowed CIDR list - remote_allowed_ips_extra = [] -} - -########################################################################### -# Local tweaks support -########################################################################### -# You can enable local tweaks files to customize your local deployment with things -# never intended to be included in the standard code. -# You can use override files to override some values of switches (see above) -# or you can force other values defined in the plan or include extra resources. -# -# See the directory "localtweak_examples" for more. - - -########################################################################### -########################################################################### -########################################################################### -# locals needed for landing_zone -locals { - # Region and Zone calculations - region = join("-", slice(split("-", var.zones[0]), 0, 2)) -} - -# locals needed for bootstrap -locals { - # dependency: landing_zone -> bootstrap - cos_data = module.landing_zone[0].cos_buckets_data - vpc_id = var.vpc_name == null ? one(module.landing_zone[0].vpc_id) : data.ibm_is_vpc.itself[0].id - vpc_cidr = join("", module.landing_zone[0].vpc_cidr) - bastion_subnets = length(var.cluster_subnet_ids) == 0 ? module.landing_zone[0].bastion_subnets : local.sorted_subnets - kms_encryption_enabled = var.key_management == "key_protect" ? true : false - boot_volume_encryption_key = var.key_management == "key_protect" ? one(module.landing_zone[0].boot_volume_encryption_key)["crn"] : null - existing_kms_instance_guid = var.key_management == "key_protect" ? module.landing_zone[0].key_management_guid : null - # cluster_id = local.region == "eu-de" || local.region == "us-east" || local.region == "us-south" ? 
var.cluster_id : "HPC-LSF-1" - total_worker_node_count = sum([for node in var.worker_node_instance_type : node.count]) -} - -# locals needed for landing_zone_vsi -locals { - bastion_security_group_id = module.bootstrap[0].bastion_security_group_id - bastion_public_key_content = module.bootstrap[0].bastion_public_key_content - bastion_private_key_content = module.bootstrap[0].bastion_private_key_content - compute_private_key_content = module.landing_zone_vsi[0].compute_private_key_content - - # dependency: landing_zone -> landing_zone_vsi - - subnets_output = module.landing_zone[0].subnets - - sorted_subnets = length(var.cluster_subnet_ids) != 0 ? [ - element(local.subnets_output, index(local.subnets_output[*].id, var.cluster_subnet_ids[0])), - element(local.subnets_output, index(local.subnets_output[*].id, var.login_subnet_id)) - ] : [] - - sorted_compute_subnets = length(var.cluster_subnet_ids) == 0 ? [ - element(module.landing_zone[0].compute_subnets, index(module.landing_zone[0].compute_subnets[*].zone, var.zones[0])) - ] : [] - - - compute_subnets = length(var.cluster_subnet_ids) == 0 ? local.sorted_compute_subnets : local.sorted_subnets -} - -# locals needed for file-storage -locals { - compute_ssh_keys = [for name in var.compute_ssh_keys : data.ibm_is_ssh_key.compute[name].id] - bastion_ssh_keys = [for name in var.bastion_ssh_keys : data.ibm_is_ssh_key.bastion[name].id] - # dependency: landing_zone_vsi -> file-share - compute_subnet_id = local.compute_subnets[0].id - compute_security_group_id = module.landing_zone_vsi[0].compute_sg_id - management_instance_count = var.management_node_count - - valid_lsf_shares = [ - for share in var.custom_file_shares : - { - mount_path = "/mnt/lsf" - nfs_share = share.nfs_share - } - if share.mount_path == "/mnt/lsf" && share.nfs_share != "" && share.nfs_share != null - ] - - valid_default_vpc_share = [ - for share in var.custom_file_shares : - { - mount_path = "/mnt/lsf" - size = share.size - iops = share.size - } - if share.mount_path == "/mnt/lsf" && share.size != null && share.iops != null - ] - - default_share = local.management_instance_count > 0 && length(local.valid_lsf_shares) == 0 && length(local.valid_default_vpc_share) == 0 ? [ - { - mount_path = "/mnt/lsf" - size = 100 - iops = 1000 - } - ] : [] - - vpc_file_share = [ - for share in var.custom_file_shares : - { - mount_path = share.mount_path - size = share.size - iops = share.iops - } - if share.size != null && share.iops != null && share.mount_path != "/mnt/lsf" - ] - - total_shares = concat(length(local.valid_default_vpc_share) == 1 ? local.valid_default_vpc_share : local.default_share, local.vpc_file_share) - - # total_shares = 10 - file_shares = [ - for count in range(length(local.total_shares)) : - { - name = format("%s-%s", var.cluster_prefix, element(split("/", local.total_shares[count]["mount_path"]), length(split("/", local.total_shares[count]["mount_path"])) - 1)) - size = local.total_shares[count]["size"] - iops = local.total_shares[count]["iops"] - } - ] -} - -# locals needed for DNS -locals { - # dependency: landing_zone -> DNS - resource_groups = { - service_rg = var.existing_resource_group == "null" ? module.landing_zone[0].resource_group_id[0]["${var.cluster_prefix}-service-rg"] : one(values(one(module.landing_zone[0].resource_group_id))) - workload_rg = var.existing_resource_group == "null" ? 
module.landing_zone[0].resource_group_id[0]["${var.cluster_prefix}-workload-rg"] : one(values(one(module.landing_zone[0].resource_group_id))) - } - vpc_crn = var.vpc_name == null ? one(module.landing_zone[0].vpc_crn) : one(data.ibm_is_vpc.itself[*].crn) - # TODO: Fix existing subnet logic - # subnets_crn = module.landing_zone.subnets_crn - compute_subnets_crn = length(var.cluster_subnet_ids) == 0 ? local.sorted_compute_subnets[*].crn : local.sorted_subnets[*].crn -} - -# locals needed for dns-records -locals { - # dependency: dns -> dns-records - dns_instance_id = module.dns.dns_instance_id - compute_dns_zone_id = one(flatten([ - for dns_zone in module.dns.dns_zone_maps : values(dns_zone) if one(keys(dns_zone)) == var.dns_domain_name["compute"] - ])) - - - # dependency: landing_zone_vsi -> dns-records - management_vsi_data = flatten(module.landing_zone_vsi[0].management_vsi_data) - management_private_ip = local.management_vsi_data[0]["ipv4_address"] - management_hostname = local.management_vsi_data[0]["name"] - - management_candidate_vsi_data = flatten(module.landing_zone_vsi[0].management_candidate_vsi_data) - management_candidate_private_ips = local.management_candidate_vsi_data[*]["ipv4_address"] - management_candidate_hostnames = local.management_candidate_vsi_data[*]["name"] - - login_vsi_data = flatten(module.landing_zone_vsi[0].login_vsi_data) - login_private_ips = local.login_vsi_data[*]["ipv4_address"] - login_hostnames = local.login_vsi_data[*]["name"] - - ldap_vsi_data = flatten(module.landing_zone_vsi[0].ldap_vsi_data) - ldap_private_ips = local.ldap_vsi_data[*]["ipv4_address"] - ldap_hostnames = local.ldap_vsi_data[*]["name"] - - worker_vsi_data = flatten(module.landing_zone_vsi[0].worker_vsi_data) - worker_private_ips = local.worker_vsi_data[*]["ipv4_address"] - - compute_dns_records = [ - for instance in local.management_vsi_data : - { - name = instance["name"] - rdata = instance["ipv4_address"] - } - ] - mgmt_candidate_dns_records = [ - for instance in local.management_candidate_vsi_data : - { - name = instance["name"] - rdata = instance["ipv4_address"] - } - ] - login_vsi_dns_records = [ - for instance in local.login_vsi_data : - { - name = instance["name"] - rdata = instance["ipv4_address"] - } - ] - ldap_vsi_dns_records = [ - for instance in local.ldap_vsi_data : - { - name = instance["name"] - rdata = instance["ipv4_address"] - } - ] - worker_vsi_dns_records = [ - for instance in local.worker_vsi_data : - { - name = instance["name"] - rdata = instance["ipv4_address"] - } - ] -} - - -# locals needed for inventory -locals { - compute_hosts = concat(local.management_vsi_data[*]["ipv4_address"], local.management_candidate_vsi_data[*]["ipv4_address"]) - compute_inventory_path = "compute.ini" - bastion_inventory_path = "bastion.ini" - login_inventory_path = "login.ini" - ldap_inventory_path = "ldap.ini" - worker_inventory_path = "worker.ini" - - bastion_host = local.bastion_instance_public_ip != null ? [local.bastion_instance_public_ip] : var.enable_fip ? 
[local.bastion_fip, local.bastion_primary_ip] : [local.bastion_primary_ip, ""] - login_host = local.login_private_ips - ldap_host = local.ldap_private_ips - worker_host = local.worker_private_ips - - cloud_logs_ingress_private_endpoint = module.cloud_monitoring_instance_creation.cloud_logs_ingress_private_endpoint -} - -# locals needed for playbook -locals { - cluster_user = "lsfadmin" - login_user = "ubuntu" - ldap_user = "ubuntu" - bastion_primary_ip = module.bootstrap[0].bastion_primary_ip - bastion_fip = module.bootstrap[0].bastion_fip[0] - bastion_fip_id = module.bootstrap[0].bastion_fip_id - no_addr_prefix = true -} - -locals { - share_path = length(local.valid_lsf_shares) > 0 ? join(", ", local.valid_lsf_shares[*].nfs_share) : module.file_storage.mount_path_1 -} - -########################################################################### -# IBM Cloud Dababase for MySQL local variables -########################################################################### -locals { - mysql_version = "8.0" - db_service_endpoints = "private" - db_template = [3, 12288, 122880, 3, "multitenant"] -} - -########################################################################### -# IBM Application Load Balancer variables -########################################################################### - -locals { - # alb_created_by_api: - # - true -> use ibmcloud API - # - false -> use ibmcloud terraform provider (not recommended, dramatically slower) - alb_created_by_api = true - alb_hostname = local.alb_created_by_api ? module.alb_api.alb_hostname : module.alb.alb_hostname -} - -locals { - vsi_management_ids = [ - for instance in concat(local.management_vsi_data, local.management_candidate_vsi_data) : - { - id = instance["id"] - } - ] -} - -# locals needed for ssh connection -locals { - ssh_forward_host = (var.app_center_high_availability ? "pac.${var.dns_domain_name.compute}" : local.management_private_ip) - ssh_forwards = "-L 8443:${local.ssh_forward_host}:8443 -L 6080:${local.ssh_forward_host}:6080 -L 8444:${local.ssh_forward_host}:8444" - ssh_jump_host = local.bastion_instance_public_ip != null ? local.bastion_instance_public_ip : var.enable_fip ? module.bootstrap[0].bastion_fip[0] : module.bootstrap[0].bastion_primary_ip - ssh_jump_option = "-J ubuntu@${local.ssh_jump_host}" - ssh_cmd = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 ${local.ssh_forwards} ${local.ssh_jump_option} lsfadmin@${join(",", local.login_private_ips)}" -} - -# Existing bastion Variables -locals { - # bastion_instance_name = var.bastion_instance_name != null ? var.bastion_instance_name : null - bastion_instance_public_ip = var.existing_bastion_instance_name != null ? var.existing_bastion_instance_public_ip : null - # bastion_security_group_id = var.bastion_instance_name != null ? var.bastion_security_group_id : module.bootstrap.bastion_security_group_id - bastion_ssh_private_key = var.existing_bastion_instance_name != null ? var.existing_bastion_ssh_private_key : null - bastion_instance_status = var.existing_bastion_instance_name != null ? false : true - existing_subnet_cidrs = var.vpc_name != null && length(var.cluster_subnet_ids) == 1 ? [data.ibm_is_subnet.existing_subnet[0].ipv4_cidr_block, data.ibm_is_subnet.existing_login_subnet[0].ipv4_cidr_block, local.vpc_cidr] : [] -} - -#################################################### -# The code below does some internal processing of variables and locals -# (e.g. concatenating lists). 
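# Worked example (illustrative values only; "203.0.113.10" is an assumed address,
# while "192.0.2.0/24" is the sample network used by the REMOTE_ALLOWED_IPS_EXTRA
# localtweak): with remote_allowed_ips = ["203.0.113.10"] and
# remote_allowed_ips_extra = ["192.0.2.0/24"], the concat() in the locals block
# that follows evaluates to
#   allowed_cidr = ["203.0.113.10", "192.0.2.0/24"]
# with module.my_ip.my_cidr appended only when add_current_ip_to_allowed_cidr is true.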
- -locals { - allowed_cidr = concat(var.remote_allowed_ips, local.remote_allowed_ips_extra, local.add_current_ip_to_allowed_cidr ? module.my_ip.my_cidr : []) -} - -locals { - profile_str = split("-", var.worker_node_instance_type[0].instance_type) - dh_profiles = var.enable_dedicated_host ? [ - for p in data.ibm_is_dedicated_host_profiles.worker[0].profiles : p if p.class == local.profile_str[0] - ] : [] - dh_profile_index = length(local.dh_profiles) == 0 ? "Profile class ${local.profile_str[0]} for dedicated hosts does not exist in ${local.region}.Check available class with `ibmcloud target -r ${local.region}; ibmcloud is dedicated-host-profiles` and retry with another worker_node_instance_type." : 0 - dh_profile = var.enable_dedicated_host ? local.dh_profiles[local.dh_profile_index] : null -} diff --git a/solutions/hpc/localtweak_examples/.gitignore b/solutions/hpc/localtweak_examples/.gitignore deleted file mode 100644 index e094084d..00000000 --- a/solutions/hpc/localtweak_examples/.gitignore +++ /dev/null @@ -1 +0,0 @@ -!localtweak__* diff --git a/solutions/hpc/localtweak_examples/README.md b/solutions/hpc/localtweak_examples/README.md deleted file mode 100644 index 85c66b91..00000000 --- a/solutions/hpc/localtweak_examples/README.md +++ /dev/null @@ -1,10 +0,0 @@ -The files in this directory are not normally included in the Terraform plan. -If you want to enable one of them in your deployment, just copy it to the base directory of the project (one level up from here), -or make a symlink if no customization is needed. Always be sure the new name ends with "*.tf". -For example: - cd solutions/hpc - ln -s localtweak_examples/localtweak___ALLOW_MY_IP_override.tf.txt localtweak___ALLOW_MY_IP_override.tf - -They add resources and/or use the "override" Terraform feature to modify the local deployment in ways that you do not want to commit to the standard code. -(note how the .gitignore file in the root directory mentions "localtweak__*.tf" files to avoid committing them accidentally) -In some cases additional configuration is required on your system (see "*_extras" directories). diff --git a/solutions/hpc/localtweak_examples/localtweak___ALLOW_MY_IP_override.tf.txt b/solutions/hpc/localtweak_examples/localtweak___ALLOW_MY_IP_override.tf.txt deleted file mode 100644 index 63aa9ce1..00000000 --- a/solutions/hpc/localtweak_examples/localtweak___ALLOW_MY_IP_override.tf.txt +++ /dev/null @@ -1,6 +0,0 @@ -# This localtweak will detect your current internet visible address and add it to the -# allowed_ips list, so you are sure you will be able to access the cluster. - -locals { - add_current_ip_to_allowed_cidr = true -} diff --git a/solutions/hpc/localtweak_examples/localtweak___FORCE_VALID_PASSWORD_override.tf.txt b/solutions/hpc/localtweak_examples/localtweak___FORCE_VALID_PASSWORD_override.tf.txt deleted file mode 100644 index 3f9245ba..00000000 --- a/solutions/hpc/localtweak_examples/localtweak___FORCE_VALID_PASSWORD_override.tf.txt +++ /dev/null @@ -1,6 +0,0 @@ -# This localtweak will skip the enforcement of password policies for app center gui, -# so you can avoid involving complex passwords when in a testing environment. 
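# Usage sketch (the commands mirror the symlink example in this directory's
# README.md; the exact destination file name is an assumption about your working tree):
#   cd solutions/hpc
#   cp localtweak_examples/localtweak___FORCE_VALID_PASSWORD_override.tf.txt \
#      localtweak___FORCE_VALID_PASSWORD_override.tf
# Because the copied name ends in "_override.tf", Terraform loads it as an
# override file, so the locals block below takes precedence over the
# validate_app_center_gui_pwd definition in input_validation.tf.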
- -locals { - validate_app_center_gui_pwd = true -} diff --git a/solutions/hpc/localtweak_examples/localtweak___PRINT_EXTRA_OUTPUTS_override.tf.txt b/solutions/hpc/localtweak_examples/localtweak___PRINT_EXTRA_OUTPUTS_override.tf.txt deleted file mode 100644 index cb2bbd30..00000000 --- a/solutions/hpc/localtweak_examples/localtweak___PRINT_EXTRA_OUTPUTS_override.tf.txt +++ /dev/null @@ -1,5 +0,0 @@ -# This localtweak will enable extra outputs (more debugging). - -locals { - print_extra_outputs = true -} diff --git a/solutions/hpc/localtweak_examples/localtweak___REMOTE_ALLOWED_IPS_EXTRA_override.tf.txt b/solutions/hpc/localtweak_examples/localtweak___REMOTE_ALLOWED_IPS_EXTRA_override.tf.txt deleted file mode 100644 index af110bae..00000000 --- a/solutions/hpc/localtweak_examples/localtweak___REMOTE_ALLOWED_IPS_EXTRA_override.tf.txt +++ /dev/null @@ -1,6 +0,0 @@ -# This localtweak will let you add IPs or networks to the allowed ips list, so you will be able -# to access the cluster from those IPs. - -locals { - remote_allowed_ips_extra = ["192.0.2.0/24"] -} diff --git a/solutions/hpc/localtweak_examples/localtweak___SSH_IP_ADDRESSES_IN_CONF.tf.txt b/solutions/hpc/localtweak_examples/localtweak___SSH_IP_ADDRESSES_IN_CONF.tf.txt deleted file mode 100644 index 16561b5d..00000000 --- a/solutions/hpc/localtweak_examples/localtweak___SSH_IP_ADDRESSES_IN_CONF.tf.txt +++ /dev/null @@ -1,43 +0,0 @@ -# This localtweak will trigger some scripts (have a look in the extras directory too), -# which can fix the ip addresses in your ssh config files, so you will be able to use -# hostname aliases in ssh commands. - -resource "null_resource" "ssh_conf_fip" { - provisioner "local-exec" { - interpreter = ["/bin/bash", "-c"] - command = "[ -f ~/.ssh/config.d/hpcaas_ip_addr.sh ] && ~/.ssh/config.d/hpcaas_ip_addr.sh ${var.cluster_prefix} fip ${local.bastion_fip}" - } - triggers = { - build = timestamp() - } -} - -resource "null_resource" "ssh_conf_login" { - provisioner "local-exec" { - interpreter = ["/bin/bash", "-c"] - command = "[ -f ~/.ssh/config.d/hpcaas_ip_addr.sh ] && ~/.ssh/config.d/hpcaas_ip_addr.sh ${var.cluster_prefix} login ${local.login_private_ips[0]}" - } - triggers = { - build = timestamp() - } -} - -resource "null_resource" "ssh_conf_mgmt_ip" { - provisioner "local-exec" { - interpreter = ["/bin/bash", "-c"] - command = "[ -f ~/.ssh/config.d/hpcaas_ip_addr.sh ] && ~/.ssh/config.d/hpcaas_ip_addr.sh ${var.cluster_prefix} mgmt ${local.management_private_ip}" - } - triggers = { - build = timestamp() - } -} - -resource "null_resource" "ssh_conf_mgmt_candidate_ips" { - provisioner "local-exec" { - interpreter = ["/bin/bash", "-c"] - command = "[ -f ~/.ssh/config.d/hpcaas_ip_addr.sh ] && ~/.ssh/config.d/hpcaas_ip_addr.sh ${var.cluster_prefix} mgmt_candidate ${join(",", local.management_candidate_private_ips)}" - } - triggers = { - build = timestamp() - } -} diff --git a/solutions/hpc/localtweak_examples/localtweak___SSH_IP_ADDRESSES_IN_CONF_extras/hpcaas.conf b/solutions/hpc/localtweak_examples/localtweak___SSH_IP_ADDRESSES_IN_CONF_extras/hpcaas.conf deleted file mode 100644 index 3a663146..00000000 --- a/solutions/hpc/localtweak_examples/localtweak___SSH_IP_ADDRESSES_IN_CONF_extras/hpcaas.conf +++ /dev/null @@ -1,33 +0,0 @@ -# This directives assume a cluster prefix of "hpc-cluster-x", you may want to customize that by replacing the target string in the comments. 
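# Illustrative usage once hpcaas_ip_addr.sh has filled in the Hostname fields
# below (alias names are the ones defined in this file; adapt them to your own prefix):
#   ssh hpcx     # bastion, as the ubuntu user
#   ssh hpcxl    # login node, proxied through the bastion as lsfadmin
#   ssh hpcx1    # first management node, with ports 8443 and 6080 forwarded for PAC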
- -### rules for all hosts ### -Host hpcx* - StrictHostKeyChecking no - UserKnownHostsFile /dev/null - -### bastion host ### -Host hpcx - Hostname 150.239.223.53 # hpc-cluster-x fip - User ubuntu - IdentityFile ~/.ssh/keys/myprivatekey - -### login host ### -Host hpcxl - Hostname 10.241.16.5 # hpc-cluster-x iplogin - User lsfadmin - ProxyJump ubuntu@hpcx - -### rules for all management nodes ### -Host hpcx1 hpcx2 hpcx3 - User lsfadmin - ProxyJump ubuntu@hpcx - LocalForward 8443 pac.hpcaas.com:8443 - LocalForward 6080 pac.hpcaas.com:6080 - -### management hosts ### -Host hpcx1 - Hostname 10.241.0.9 # hpc-cluster-x ip1 -Host hpcx2 - Hostname 10.241.0.11 # hpc-cluster-x ip2 -Host hpcx3 - Hostname 10.241.0.10 # hpc-cluster-x ip3 diff --git a/solutions/hpc/localtweak_examples/localtweak___SSH_IP_ADDRESSES_IN_CONF_extras/hpcaas_ip_addr.sh b/solutions/hpc/localtweak_examples/localtweak___SSH_IP_ADDRESSES_IN_CONF_extras/hpcaas_ip_addr.sh deleted file mode 100755 index d5b6e51a..00000000 --- a/solutions/hpc/localtweak_examples/localtweak___SSH_IP_ADDRESSES_IN_CONF_extras/hpcaas_ip_addr.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -prefix="$1" -iptype="$2" -ipvalue="$3" - -if [ "$iptype" == "fip" ]; then - ip="$ipvalue" - sed -i -e "s/ Hostname .* # $prefix fip/ Hostname $ip # $prefix fip/g" ~/.ssh/config.d/hpcaas.conf -elif [ "$iptype" == "login" ]; then - ip="$ipvalue" - sed -i -e "s/ Hostname .* # $prefix iplogin/ Hostname $ip # $prefix iplogin/g" ~/.ssh/config.d/hpcaas.conf -elif [ "$iptype" == "mgmt" ]; then - ip1="$ipvalue" - sed -i -e "s/ Hostname .* # $prefix ip1/ Hostname $ip1 # $prefix ip1/g" ~/.ssh/config.d/hpcaas.conf -elif [ "$iptype" == "mgmt_candidate" ]; then - ips="$ipvalue" - ip2="${ips%,*}" - ip3="${ips#*,}" - sed -i -e "s/ Hostname .* # $prefix ip2/ Hostname $ip2 # $prefix ip2/g" ~/.ssh/config.d/hpcaas.conf - sed -i -e "s/ Hostname .* # $prefix ip3/ Hostname $ip3 # $prefix ip3/g" ~/.ssh/config.d/hpcaas.conf -fi diff --git a/solutions/hpc/main.tf b/solutions/hpc/main.tf deleted file mode 100644 index be5a3c10..00000000 --- a/solutions/hpc/main.tf +++ /dev/null @@ -1,477 +0,0 @@ -module "landing_zone" { - count = var.solution == "lsf" || var.solution == "hpc" ? 1 : 0 - source = "../../modules/landing_zone" - compute_subnets_cidr = var.vpc_cluster_private_subnets_cidr_blocks - cos_instance_name = var.cos_instance_name - enable_atracker = var.observability_atracker_enable && (var.observability_atracker_target_type == "cos") ? true : false - enable_cos_integration = var.enable_cos_integration - cos_expiration_days = var.cos_expiration_days - enable_vpc_flow_logs = var.enable_vpc_flow_logs - enable_vpn = var.vpn_enabled - key_management = var.key_management - kms_instance_name = var.kms_instance_name - kms_key_name = var.kms_key_name - ssh_keys = var.bastion_ssh_keys - bastion_subnets_cidr = var.vpc_cluster_login_private_subnets_cidr_blocks - network_cidr = var.vpc_cidr - prefix = var.cluster_prefix - resource_group = var.existing_resource_group - vpc = var.vpc_name - subnet_id = var.cluster_subnet_ids - login_subnet_id = var.login_subnet_id - zones = var.zones - no_addr_prefix = local.no_addr_prefix - scc_enable = var.scc_enable - skip_flowlogs_s2s_auth_policy = var.skip_flowlogs_s2s_auth_policy - observability_logs_enable = var.observability_logs_enable_for_management || var.observability_logs_enable_for_compute || (var.observability_atracker_enable && var.observability_atracker_target_type == "cloudlogs") ? 
true : false -} - -module "bootstrap" { - count = var.solution == "lsf" || var.solution == "hpc" ? 1 : 0 - source = "./../../modules/bootstrap" - resource_group = local.resource_groups["workload_rg"] - prefix = var.cluster_prefix - vpc_id = local.vpc_id - network_cidr = var.vpc_name != null && length(var.cluster_subnet_ids) > 0 ? local.existing_subnet_cidrs : split(",", var.vpc_cidr) - bastion_subnets = local.bastion_subnets - ssh_keys = local.bastion_ssh_keys - allowed_cidr = local.allowed_cidr - kms_encryption_enabled = local.kms_encryption_enabled - boot_volume_encryption_key = local.boot_volume_encryption_key - existing_kms_instance_guid = local.existing_kms_instance_guid - skip_iam_authorization_policy = var.skip_iam_block_storage_authorization_policy - bastion_instance_name = var.existing_bastion_instance_name - bastion_instance_public_ip = local.bastion_instance_public_ip - bastion_security_group_id = var.existing_bastion_instance_name != null ? var.existing_bastion_security_group_id : null - ldap_server = var.ldap_server -} - -module "generate_db_adminpassword" { - count = var.enable_app_center && var.app_center_high_availability ? 1 : 0 - source = "../../modules/security/password" - length = 15 - special = true - override_special = "-_" - min_numeric = 1 -} - -module "db" { - count = var.enable_app_center && var.app_center_high_availability ? 1 : 0 - source = "../../modules/database/mysql" - resource_group_id = local.resource_groups["service_rg"] - name = "${var.cluster_prefix}-database" - region = data.ibm_is_region.region.name - mysql_version = local.mysql_version - service_endpoints = local.db_service_endpoints - admin_password = "db-${module.generate_db_adminpassword[0].password}" # with a prefix so we start with a letter - members = local.db_template[0] - memory = local.db_template[1] - disks = local.db_template[2] - vcpu = local.db_template[3] - host_flavour = local.db_template[4] -} - -module "ce_project" { - source = "./../../modules/ce_project" - ibmcloud_api_key = var.ibmcloud_api_key - region = data.ibm_is_region.region.name - resource_group_id = local.resource_groups["workload_rg"] - reservation_id = var.reservation_id - solution = var.solution -} - -module "landing_zone_vsi" { - count = var.solution == "lsf" || var.solution == "hpc" ? 1 : 0 - source = "../../modules/landing_zone_vsi" - resource_group = local.resource_groups["workload_rg"] - ibmcloud_api_key = var.ibmcloud_api_key - prefix = var.cluster_prefix - zones = var.zones - vpc_id = local.vpc_id - bastion_fip = local.bastion_fip - bastion_security_group_id = local.bastion_security_group_id - bastion_public_key_content = local.bastion_public_key_content - cluster_user = local.cluster_user - compute_private_key_content = local.compute_private_key_content - bastion_private_key_content = local.bastion_ssh_private_key != null ? 
local.bastion_ssh_private_key : local.bastion_private_key_content - compute_subnets = local.compute_subnets - compute_ssh_keys = local.compute_ssh_keys - management_image_name = var.management_image_name - compute_image_name = var.compute_image_name - login_image_name = var.login_image_name - dns_domain_names = var.dns_domain_name - kms_encryption_enabled = local.kms_encryption_enabled - boot_volume_encryption_key = local.boot_volume_encryption_key - share_path = local.share_path - hyperthreading_enabled = var.hyperthreading_enabled - app_center_gui_pwd = var.app_center_gui_pwd - enable_app_center = var.enable_app_center - contract_id = var.reservation_id - cluster_id = var.cluster_name - management_node_count = var.management_node_count - management_node_instance_type = var.management_node_instance_type - file_share = length(local.valid_lsf_shares) > 0 ? module.file_storage.total_mount_paths : module.file_storage.mount_paths_excluding_first - mount_path = var.custom_file_shares - login_node_instance_type = var.login_node_instance_type - bastion_subnets = local.bastion_subnets - bastion_ssh_keys = local.bastion_ssh_keys - enable_ldap = var.enable_ldap - ldap_basedns = var.ldap_basedns - login_private_ips = join("", local.login_private_ips) - ldap_vsi_profile = var.ldap_vsi_profile - ldap_admin_password = var.ldap_admin_password - ldap_user_name = var.ldap_user_name - ldap_user_password = var.ldap_user_password - ldap_server = var.ldap_server - ldap_server_cert = var.ldap_server_cert - ldap_vsi_osimage_name = var.ldap_vsi_osimage_name - ldap_primary_ip = local.ldap_private_ips - app_center_high_availability = var.app_center_high_availability - db_instance_info = var.enable_app_center && var.app_center_high_availability ? module.db[0].db_instance_info : null - db_admin_password = var.enable_app_center && var.app_center_high_availability ? module.db[0].db_admin_password : null - storage_security_group_id = var.storage_security_group_id - observability_monitoring_enable = var.observability_monitoring_enable - observability_monitoring_on_compute_nodes_enable = var.observability_monitoring_on_compute_nodes_enable - cloud_monitoring_access_key = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation.cloud_monitoring_access_key : "" - cloud_monitoring_ingestion_url = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation.cloud_monitoring_ingestion_url : "" - cloud_monitoring_prws_key = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation.cloud_monitoring_prws_key : "" - cloud_monitoring_prws_url = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation.cloud_monitoring_prws_url : "" - bastion_instance_name = var.existing_bastion_instance_name - ce_project_guid = module.ce_project.guid - existing_kms_instance_guid = local.existing_kms_instance_guid - cloud_logs_ingress_private_endpoint = local.cloud_logs_ingress_private_endpoint - observability_logs_enable_for_management = var.observability_logs_enable_for_management - observability_logs_enable_for_compute = var.observability_logs_enable_for_compute - solution = var.solution - worker_node_max_count = var.worker_node_max_count - worker_node_instance_type = var.worker_node_instance_type - enable_dedicated_host = var.enable_dedicated_host - dedicated_host_id = var.enable_dedicated_host && local.total_worker_node_count >= 1 ? 
module.dedicated_host[0].dedicated_host_id[0] : null - depends_on = [ - module.validate_ldap_server_connection, - module.dedicated_host - ] -} - -module "file_storage" { - source = "../../modules/file_storage" - zone = var.zones[0] # always the first zone - resource_group = local.resource_groups["workload_rg"] - file_shares = local.file_shares - encryption_key_crn = local.boot_volume_encryption_key - security_group_ids = local.compute_security_group_id - subnet_id = local.compute_subnet_id - prefix = var.cluster_prefix - existing_kms_instance_guid = local.existing_kms_instance_guid - skip_iam_share_authorization_policy = var.skip_iam_share_authorization_policy - kms_encryption_enabled = local.kms_encryption_enabled -} - -module "dns" { - source = "./../../modules/dns" - prefix = var.cluster_prefix - resource_group_id = local.resource_groups["service_rg"] - vpc_crn = local.vpc_crn - subnets_crn = local.compute_subnets_crn - dns_instance_id = var.dns_instance_id - dns_custom_resolver_id = var.dns_custom_resolver_id - dns_domain_names = values(var.dns_domain_name) -} - -module "alb" { - source = "./../../modules/alb" - bastion_subnets = local.bastion_subnets - resource_group_id = local.resource_groups["workload_rg"] - prefix = var.cluster_prefix - security_group_ids = concat(local.compute_security_group_id, [local.bastion_security_group_id]) - vsi_ids = local.vsi_management_ids - certificate_instance = var.enable_app_center && var.app_center_high_availability ? var.app_center_existing_certificate_instance : "" - create_load_balancer = !local.alb_created_by_api && var.app_center_high_availability && var.enable_app_center -} - -module "alb_api" { - source = "./../../modules/alb_api" - ibmcloud_api_key = var.ibmcloud_api_key - region = data.ibm_is_region.region.name - bastion_subnets = local.bastion_subnets - resource_group_id = local.resource_groups["workload_rg"] - prefix = var.cluster_prefix - security_group_ids = concat(local.compute_security_group_id, [local.bastion_security_group_id]) - vsi_ips = concat([local.management_private_ip], local.management_candidate_private_ips) - certificate_instance = var.enable_app_center && var.app_center_high_availability ? var.app_center_existing_certificate_instance : "" - create_load_balancer = local.alb_created_by_api && var.app_center_high_availability && var.enable_app_center -} - -################################################### -# DNS Modules to create DNS domains and records -################################################## -module "compute_dns_records" { - source = "./../../modules/dns_record" - dns_instance_id = local.dns_instance_id - dns_zone_id = local.compute_dns_zone_id - dns_records = local.compute_dns_records - dns_domain_names = var.dns_domain_name -} - -module "worker_dns_records" { - count = var.solution == "lsf" ? 
1 : 0 - source = "./../../modules/dns_record" - dns_instance_id = local.dns_instance_id - dns_zone_id = local.compute_dns_zone_id - dns_records = local.worker_vsi_dns_records - dns_domain_names = var.dns_domain_name -} - -module "compute_candidate_dns_records" { - source = "./../../modules/dns_record" - dns_instance_id = local.dns_instance_id - dns_zone_id = local.compute_dns_zone_id - dns_records = local.mgmt_candidate_dns_records - dns_domain_names = var.dns_domain_name -} - -module "login_vsi_dns_records" { - source = "./../../modules/dns_record" - dns_instance_id = local.dns_instance_id - dns_zone_id = local.compute_dns_zone_id - dns_records = local.login_vsi_dns_records - dns_domain_names = var.dns_domain_name -} - -module "ldap_vsi_dns_records" { - source = "./../../modules/dns_record" - dns_instance_id = local.dns_instance_id - dns_zone_id = local.compute_dns_zone_id - dns_records = local.ldap_vsi_dns_records - dns_domain_names = var.dns_domain_name -} - -# DNS entry needed to ALB, can be moved in dns_record module for example -resource "ibm_dns_resource_record" "pac_cname" { - count = var.enable_app_center && var.app_center_high_availability ? 1 : 0 - instance_id = local.dns_instance_id - zone_id = local.compute_dns_zone_id - type = "CNAME" - name = "pac" - ttl = 300 - rdata = local.alb_hostname -} - -module "compute_inventory" { - source = "./../../modules/inventory" - hosts = local.compute_hosts - user = local.cluster_user - server_name = "[HPCAASCluster]" - inventory_path = local.compute_inventory_path -} - -module "worker_inventory" { - count = var.solution == "lsf" ? 1 : 0 - source = "./../../modules/inventory" - hosts = local.worker_host - user = local.cluster_user - server_name = "[WorkerServer]" - inventory_path = local.worker_inventory_path -} - - -################################################### -# Creation of inventory files for the automation usage -################################################## -module "bastion_inventory" { - source = "./../../modules/inventory" - hosts = local.bastion_host - user = local.login_user - server_name = "[BastionServer]" - inventory_path = local.bastion_inventory_path -} - -module "login_inventory" { - source = "./../../modules/inventory" - hosts = local.login_host - user = local.cluster_user - server_name = "[LoginServer]" - inventory_path = local.login_inventory_path -} - -module "ldap_inventory" { - source = "./../../modules/inventory" - hosts = local.ldap_host - user = local.ldap_user - server_name = "[LDAPServer]" - inventory_path = local.ldap_inventory_path -} - -################################################### -# REMOTE_EXEC : Remote exec block to perform certain checks on the cluster nodes -################################################## -module "check_cluster_status" { - source = "./../../modules/null/remote_exec" - cluster_host = [local.management_private_ip] #["10.10.10.4"] - cluster_user = local.cluster_user #"root" - cluster_private_key = local.compute_private_key_content - login_host = local.bastion_fip - login_user = "ubuntu" - login_private_key = local.bastion_ssh_private_key != null ? 
local.bastion_ssh_private_key : local.bastion_private_key_content - command = ["lshosts -w; lsid || (sleep 5; lsid) || (sleep 15; lsid)"] # we give it more time if not ready - depends_on = [ - module.landing_zone_vsi, # this implies vsi have been configured too - module.bootstrap - ] -} - -module "check_node_status" { - source = "./../../modules/null/remote_exec" - cluster_host = concat(local.management_candidate_private_ips, [local.management_private_ip]) - cluster_user = local.cluster_user - cluster_private_key = local.compute_private_key_content - login_host = local.bastion_fip - login_user = "ubuntu" - login_private_key = local.bastion_ssh_private_key != null ? local.bastion_ssh_private_key : local.bastion_private_key_content - command = ["systemctl --no-pager -n 5 status lsfd"] - depends_on = [ - module.landing_zone_vsi, - module.bootstrap, - module.check_cluster_status - ] -} - -module "validate_ldap_server_connection" { - source = "./../../modules/null/ldap_remote_exec" - ldap_server = var.ldap_server - enable_ldap = var.enable_ldap - login_private_key = local.bastion_ssh_private_key != null ? local.bastion_ssh_private_key : local.bastion_private_key_content - login_host = local.bastion_fip - login_user = "ubuntu" - depends_on = [module.bootstrap] -} - -# Module used to destroy the non-essential resources -module "login_fip_removal" { - source = "./../../modules/null/local_exec" - count = var.enable_fip ? 0 : 1 - region = data.ibm_is_region.region.name - ibmcloud_api_key = var.ibmcloud_api_key - trigger_resource_id = local.bastion_fip_id - command = "ibmcloud is ipd ${local.bastion_fip_id} -f" - depends_on = [module.check_cluster_status] -} - -resource "null_resource" "destroy_compute_resources" { - triggers = { - conn_user = local.cluster_user - conn_host = local.management_private_ip - conn_private_key = local.compute_private_key_content - conn_bastion_host = local.bastion_fip - conn_bastion_private_key = local.bastion_ssh_private_key != null ? local.bastion_ssh_private_key : local.bastion_private_key_content - } - - # only works if fip is enabled & vpn is disabled (conn is must) - count = false && var.enable_fip == true && var.vpn_enabled == false ? 1 : 0 - - connection { - type = "ssh" - host = self.triggers.conn_host - user = self.triggers.conn_user - private_key = self.triggers.conn_private_key - bastion_host = self.triggers.conn_bastion_host - bastion_user = "ubuntu" - bastion_private_key = self.triggers.conn_bastion_private_key - timeout = "60m" - } - - provisioner "remote-exec" { - when = destroy - on_failure = fail - inline = [file("${path.module}/scripts/destroy_script.sh")] - } -} - -######################################################################################################### -# validation_script_executor Module -# -# Purpose: This module is included for testing purposes. -# It provides a conditional mechanism for executing remote scripts on cluster hosts. -# The execution is triggered if the script filenames listed in TF_VALIDATION_SCRIPT_FILES are provided. -# -# Usage: -# - When scripts are listed in TF_VALIDATION_SCRIPT_FILES, the corresponding scripts -# will be executed on the cluster hosts using remote command execution. -# - The conditional nature ensures that scripts are executed only when necessary. -# This can be useful for various validation or maintenance tasks. 
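To illustrate the mechanism described in this comment block: the executor only runs when TF_VALIDATION_SCRIPT_FILES is non-empty, each listed file is read from examples/scripts/ relative to the module, and (per the variable's validation further below) every name must end in .sh. A purely illustrative setting, with a hypothetical script name, could be:

    TF_VALIDATION_SCRIPT_FILES = ["cluster_sanity_check.sh"]  # hypothetical script placed under examples/scripts/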
-######################################################################################################### - -module "validation_script_executor" { - source = "./../../modules/null/remote_exec" - count = var.TF_VALIDATION_SCRIPT_FILES != null && length(var.TF_VALIDATION_SCRIPT_FILES) > 0 ? 1 : 0 - - cluster_host = [local.management_private_ip] - cluster_user = local.cluster_user - cluster_private_key = local.compute_private_key_content - login_host = local.bastion_fip - login_user = "ubuntu" - login_private_key = local.bastion_ssh_private_key != null ? local.bastion_ssh_private_key : local.bastion_private_key_content - - command = [ - for script_name in var.TF_VALIDATION_SCRIPT_FILES : - file("${path.module}/examples/scripts/${script_name}") - ] - depends_on = [ - module.landing_zone_vsi, - module.bootstrap, - module.check_cluster_status - ] -} - -################################################### -# Observability Modules -################################################### - -module "cloud_monitoring_instance_creation" { - source = "../../modules/observability_instance" - location = local.region - rg = local.resource_groups["service_rg"] - cloud_monitoring_provision = var.observability_monitoring_enable - observability_monitoring_plan = var.observability_monitoring_plan - enable_metrics_routing = var.observability_enable_metrics_routing - enable_platform_logs = var.observability_enable_platform_logs - cluster_prefix = var.cluster_prefix - cloud_monitoring_instance_name = "${var.cluster_prefix}-metrics" - cloud_logs_provision = var.observability_logs_enable_for_management || var.observability_logs_enable_for_compute ? true : false - cloud_logs_instance_name = "${var.cluster_prefix}-cloud-logs" - cloud_logs_retention_period = var.observability_logs_retention_period - cloud_logs_as_atracker_target = var.observability_atracker_enable && (var.observability_atracker_target_type == "cloudlogs") ? true : false - cloud_logs_data_bucket = length([for bucket in local.cos_data : bucket if strcontains(bucket.bucket_name, "logs-data-bucket")]) > 0 ? [for bucket in local.cos_data : bucket if strcontains(bucket.bucket_name, "logs-data-bucket")][0] : null - cloud_metrics_data_bucket = length([for bucket in local.cos_data : bucket if strcontains(bucket.bucket_name, "metrics-data-bucket")]) > 0 ? [for bucket in local.cos_data : bucket if strcontains(bucket.bucket_name, "metrics-data-bucket")][0] : null - tags = ["lsf", var.cluster_prefix] -} - -# Code for SCC Instance -module "scc_instance_and_profile" { - count = var.scc_enable ? 1 : 0 - source = "./../../modules/security/scc" - location = var.scc_location != "" ? var.scc_location : "us-south" - rg = local.resource_groups["service_rg"] - scc_profile = var.scc_enable ? var.scc_profile : "" - event_notification_plan = var.scc_event_notification_plan - tags = ["lsf", var.cluster_prefix] - prefix = var.cluster_prefix - cos_bucket = [for name in module.landing_zone[0].cos_buckets_names : name if strcontains(name, "scc-bucket")][0] - cos_instance_crn = module.landing_zone[0].cos_instance_crns[0] -} - -module "my_ip" { - source = "../../modules/my_ip" -} - -module "dedicated_host" { - count = var.enable_dedicated_host && local.total_worker_node_count >= 1 ? 
1 : 0 - source = "../../modules/dedicated_host" - prefix = var.cluster_prefix - zone = var.zones - existing_host_group = false - class = local.dh_profile.class - profile = local.dh_profile.name - family = local.dh_profile.family - resource_group_id = local.resource_groups["workload_rg"] -} diff --git a/solutions/hpc/outputs.tf b/solutions/hpc/outputs.tf deleted file mode 100644 index 7171f2ea..00000000 --- a/solutions/hpc/outputs.tf +++ /dev/null @@ -1,104 +0,0 @@ -output "region_name" { - description = "The region name in which the cluster resources have been deployed" - value = data.ibm_is_region.region.name -} - -output "image_entry_found" { - description = "Available if the image name provided is located within the image map" - value = module.landing_zone_vsi[0].image_map_entry_found -} - -output "vpc_name" { - description = "The VPC name in which the cluster resources have been deployed" - value = "${data.ibm_is_vpc.vpc.name} -- - ${data.ibm_is_vpc.vpc.id}" -} - -output "ssh_to_management_node_1" { - description = "SSH command to connect to HPC cluster" - value = local.bastion_instance_public_ip != null ? "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@${local.bastion_instance_public_ip} lsfadmin@${local.compute_hosts[0]}" : var.enable_fip ? "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@${module.bootstrap[0].bastion_fip[0]} lsfadmin@${local.compute_hosts[0]}" : "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@${module.bootstrap[0].bastion_primary_ip} lsfadmin@${local.compute_hosts[0]}" -} - -output "ssh_to_ldap_node" { - description = "SSH command to connect to LDAP node" - value = var.enable_ldap && var.ldap_server == "null" ? (var.enable_fip ? "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -J ubuntu@${module.bootstrap[0].bastion_fip[0]} ubuntu@${module.landing_zone_vsi[0].ldap_server}" : "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@${module.bootstrap[0].bastion_primary_ip} ubuntu@${module.landing_zone_vsi[0].ldap_server}") : null -} - -output "ssh_to_login_node" { - description = "SSH command to connect to Login node" - value = local.bastion_instance_public_ip != null ? "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@${local.bastion_instance_public_ip} lsfadmin@${join(",", local.login_private_ips)}" : var.enable_fip ? "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@${module.bootstrap[0].bastion_fip[0]} lsfadmin@${join(",", local.login_private_ips)}" : "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J ubuntu@${module.bootstrap[0].bastion_primary_ip} lsfadmin@${join(",", local.login_private_ips)}" -} - -output "application_center_tunnel" { - description = "Available if IBM Spectrum LSF Application Center GUI is installed" - value = var.enable_app_center ? local.ssh_cmd : null -} - -output "application_center_url" { - description = "Available if IBM Spectrum LSF Application Center GUI is installed" - value = var.enable_app_center ? var.app_center_high_availability ? "https://pac.${var.dns_domain_name.compute}:8443" : "https://localhost:8443" : null -} - -output "application_center_url_note" { - description = "Available if IBM Spectrum LSF Application Center GUI is installed in High Availability" - value = var.enable_app_center && var.app_center_high_availability ? 
"you may need '127.0.0.1 pac pac.${var.dns_domain_name.compute}' on /etc/hosts of your local machine where the connection is established, to let your browser use the ssh tunnel" : null -} - -output "remote_allowed_cidr" { - description = "The following IPs/networks are allow-listed for incoming connections" - value = local.allowed_cidr -} - -output "management_hostname" { - description = "Management node has this hostname:" - value = local.print_extra_outputs ? local.management_hostname : null -} - -output "management_ip" { - description = "Management node has this IP:" - value = local.print_extra_outputs ? local.management_private_ip : null -} - -output "management_candidate_hostnames" { - description = "Management candidate nodes have these hostnames:" - value = local.print_extra_outputs ? local.management_candidate_hostnames : null -} - -output "management_candidate_ips" { - description = "Management candidate nodes have these IPs:" - value = local.print_extra_outputs ? local.management_candidate_private_ips : null -} - -output "login_hostnames" { - description = "Login nodes have these hostnames:" - value = local.print_extra_outputs ? local.login_hostnames : null -} - -output "login_ips" { - description = "Login nodes have these IPs:" - value = local.print_extra_outputs ? local.login_private_ips : null -} - -output "ldap_hostnames" { - description = "LDAP nodes have these hostnames:" - value = local.print_extra_outputs ? local.ldap_hostnames : null -} - -output "ldap_ips" { - description = "LDAP nodes have these IPs:" - value = local.print_extra_outputs ? local.ldap_private_ips : null -} - -output "cloud_monitoring_url" { - value = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation.cloud_monitoring_url : null - description = "IBM Cloud Monitoring URL" -} - -output "cloud_logs_url" { - value = (var.observability_logs_enable_for_management || var.observability_logs_enable_for_compute) ? module.cloud_monitoring_instance_creation.cloud_logs_url : null - description = "IBM Cloud Logs URL" -} - -output "worker_node_min_count" { - description = "Provides the total number of count for the static worker node." - value = local.total_worker_node_count -} diff --git a/solutions/hpc/scripts/destroy_script.sh b/solutions/hpc/scripts/destroy_script.sh deleted file mode 100644 index 01db6abe..00000000 --- a/solutions/hpc/scripts/destroy_script.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -badmin qclose all -bkill -u all -while true; do - if (bhosts -o status) | grep ok; then - sleep 1m - else - sleep 2m - exit 0 - fi -done diff --git a/solutions/hpc/variables.tf b/solutions/hpc/variables.tf deleted file mode 100644 index ff78886f..00000000 --- a/solutions/hpc/variables.tf +++ /dev/null @@ -1,680 +0,0 @@ -############################################################################## -# Account Variables -############################################################################## - -variable "ibmcloud_api_key" { - description = "IBM Cloud API key for the IBM Cloud account where the IBM Spectrum LSF cluster needs to be deployed. For more information on how to create an API key, see [Managing user API keys](https://cloud.ibm.com/docs/account?topic=account-userapikey)." - type = string - sensitive = true - validation { - condition = var.ibmcloud_api_key != "" - error_message = "The API key for IBM Cloud must be set." 
- } -} - -############################################################################## -# Resource Groups Variables -############################################################################## - -variable "existing_resource_group" { - description = "Specify the name of the existing resource group in your IBM Cloud account where VPC resources will be deployed. By default, the resource group is set to 'Default.' In some older accounts, it may be 'default,' so please verify the resource group name before proceeding. If the value is set to \"null\", the automation will create two separate resource groups: 'workload-rg' and 'service-rg.' For more details, see Managing resource groups." - type = string - default = "Default" - validation { - condition = var.existing_resource_group != null - error_message = "If you want to provide null for resource_group variable, it should be within double quotes." - } -} - -############################################################################## -# Module Level Variables -############################################################################## - -variable "cluster_prefix" { - description = "The prefix is used to name the IBM Cloud LSF cluster and the resources provisioned to build the cluster instance. Each Spectrum LSF cluster must have a unique name, so ensure the prefix is distinct. It must begin with a lowercase letter and can only include lowercase letters, digits, and hyphens. Hyphens must be followed by a lowercase letter or digit, with no leading, trailing, or consecutive hyphens. The prefix length must be less than 16 characters." - type = string - default = "hpc-lsf" - - validation { - error_message = "Prefix must start with a lowercase letter and contain only lowercase letters, digits, and hyphens in between. Hyphens must be followed by at least one lowercase letter or digit. There are no leading, trailing, or consecutive hyphens." - condition = can(regex("^[a-z](?:[a-z0-9]*(-[a-z0-9]+)*)?$", var.cluster_prefix)) - } - validation { - condition = length(var.cluster_prefix) <= 16 - error_message = "The cluster_prefix must be 16 characters or fewer." - } -} - -variable "solution" { - type = string - default = "lsf" - description = "Provide the value for the solution that is needed for the support of lsf and HPC" - validation { - condition = contains(["hpc", "lsf"], var.solution) - error_message = "supported values are only lsf for BYOL and HPC" - } -} - -variable "zones" { - description = "Specify the IBM Cloud zone within the chosen region where the IBM Spectrum LSF cluster will be deployed. A single zone input is required, and the management nodes, file storage shares, and compute nodes will all be provisioned in this zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli)." - type = list(string) - default = ["us-east-1"] - validation { - condition = length(var.zones) == 1 - error_message = "HPC product deployment supports only a single zone. Provide a value for a single zone from the supported regions: eu-de-2 or eu-de-3 for eu-de, us-east-1 or us-east-3 for us-east, and us-south-1 for us-south." - } -} - -variable "cluster_name" { - type = string - description = "Provide a unique cluster name that LSF uses to configure and group the cluster. Without this name, LSF cannot form a cluster, and the initial deployments will fail. The cluster name can be up to 39 alphanumeric characters and may include underscores (_), hyphens (-), and periods (.). 
Spaces and other special characters are not allowed. Avoid using the name of any host or user as the cluster name. Note that the cluster name cannot be changed after deployment." - validation { - condition = 0 < length(var.cluster_name) && length(var.cluster_name) < 40 && can(regex("^[a-zA-Z0-9_.-]+$", var.cluster_name)) - error_message = "The Cluster name can be up to 39 alphanumeric characters including the underscore (_), the hyphen (-), and the period (.) characters. Other special characters and spaces are not allowed." - } -} - -variable "reservation_id" { - type = string - sensitive = true - default = null - description = "Ensure that you have received the reservation ID from IBM technical sales. Reservation ID is a unique identifier to distinguish different IBM Cloud HPC service agreements. It must start with a letter and can only contain letters, numbers, hyphens (-), or underscores (_)." -} - -############################################################################## -# VPC Variables -############################################################################## - -variable "vpc_name" { - type = string - description = "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" - default = null -} - -variable "cluster_subnet_ids" { - type = list(string) - default = [] - description = "Provide the list of existing subnet ID under the existing VPC where the cluster will be provisioned. One subnet ID is required as input value. The management nodes, file storage shares, and compute nodes will be deployed in the same zone." - validation { - condition = contains([0, 1], length(var.cluster_subnet_ids)) - error_message = "The subnet_id value should either be empty or contain exactly one element. Provide only a single subnet value from the supported zones." - } -} - -variable "login_subnet_id" { - type = string - default = null - description = "Provide the list of existing subnet ID under the existing VPC, where the login/bastion server will be provisioned. One subnet id is required as input value for the creation of login node and bastion in the same zone as the management nodes. Note: Provide a different subnet id for login_subnet_id, do not overlap or provide the same subnet id that was already provided for cluster_subnet_ids." -} - -variable "vpc_cidr" { - description = "Creates the address prefix for the new VPC, when the vpc_name variable is empty. The VPC requires an address prefix for creation of subnet in a single zone. The subnet are created with the specified CIDR blocks. For more information, see [Setting IP ranges](https://cloud.ibm.com/docs/vpc?topic=vpc-vpc-addressing-plan-design)." - type = string - default = "10.241.0.0/18" -} - -variable "vpc_cluster_private_subnets_cidr_blocks" { - type = list(string) - default = ["10.241.0.0/20"] - description = "Provide the CIDR block required for the creation of the compute cluster's private subnet. One CIDR block is required. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Ensure the selected CIDR block size can accommodate the maximum number of management and dynamic compute nodes expected in your cluster. For more information on CIDR block size selection, refer to the documentation, see [Choosing IP ranges for your VPC](https://cloud.ibm.com/docs/vpc?topic=vpc-choosing-ip-ranges-for-your-vpc)." 
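Taken together, the account, prefix, zone, cluster-name, and network inputs defined in this variables.tf drive the whole deployment. A minimal terraform.tfvars sketch, reusing the defaults shown in these definitions and placeholder values for anything account-specific, might look like:

    ibmcloud_api_key   = "<your IBM Cloud API key>"   # placeholder, keep out of version control
    solution           = "lsf"
    cluster_prefix     = "hpc-lsf"
    zones              = ["us-east-1"]
    cluster_name       = "my_lsf_cluster"             # placeholder; up to 39 chars of letters, digits, _ - .
    remote_allowed_ips = ["169.45.117.34"]            # example public IP taken from the variable description
    bastion_ssh_keys   = ["key-name-1"]               # placeholder SSH key names already present in the account
    compute_ssh_keys   = ["key-name-1"]
    vpc_cidr           = "10.241.0.0/18"
    vpc_cluster_private_subnets_cidr_blocks       = ["10.241.0.0/20"]
    vpc_cluster_login_private_subnets_cidr_blocks = ["10.241.16.0/28"]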
- validation { - condition = length(var.vpc_cluster_private_subnets_cidr_blocks) == 1 - error_message = "Single zone is supported to deploy resources. Provide a CIDR range of subnets creation." - } -} - -variable "vpc_cluster_login_private_subnets_cidr_blocks" { - type = list(string) - default = ["10.241.16.0/28"] - description = "Provide the CIDR block required for the creation of the login cluster's private subnet. Only one CIDR block is needed. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Since the login subnet is used only for the creation of login virtual server instances, provide a CIDR range of /28." - validation { - condition = length(var.vpc_cluster_login_private_subnets_cidr_blocks) <= 1 - error_message = "Only a single zone is supported to deploy resources. Provide a CIDR range of subnet creation." - } - validation { - condition = tonumber(regex("/(\\d+)", join(",", var.vpc_cluster_login_private_subnets_cidr_blocks))[0]) <= 28 - error_message = "This subnet is used to create only a login virtual server instance. Providing a larger CIDR size will waste the usage of available IPs. A CIDR range of /28 is sufficient for the creation of the login subnet." - } -} - -############################################################################## -# Access Variables -############################################################################## - -variable "remote_allowed_ips" { - type = list(string) - description = "Comma-separated list of IP addresses that can access the IBM Spectrum LSF cluster instance through an SSH interface. For security purposes, provide the public IP addresses assigned to the devices that are authorized to establish SSH connections (for example, [\"169.45.117.34\"]). To fetch the IP address of the device, use [https://ipv4.icanhazip.com/](https://ipv4.icanhazip.com/)." - validation { - condition = alltrue([ - for o in var.remote_allowed_ips : !contains(["0.0.0.0/0", "0.0.0.0"], o) - ]) - error_message = "For security, provide the public IP addresses assigned to the devices authorized to establish SSH connections. Use https://ipv4.icanhazip.com/ to fetch the ip address of the device." - } - validation { - condition = alltrue([ - for a in var.remote_allowed_ips : can(regex("^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(/(3[0-2]|2[0-9]|1[0-9]|[0-9]))?$", a)) - ]) - error_message = "The provided IP address format is not valid. Check if the IP address contains a comma instead of a dot, and ensure there are double quotation marks between each IP address range if using multiple IP ranges. For multiple IP address, use the format [\"169.45.117.34\",\"128.122.144.145\"]." - } -} - -############################################################################## -# Compute Variables -############################################################################## - -variable "bastion_ssh_keys" { - type = list(string) - description = "Provide the list of SSH key names configured in your IBM Cloud account to establish a connection to the Spectrum LSF bastion and login node. Make sure the SSH key exists in the same resource group and region where the cluster is being provisioned. To pass multiple SSH keys, use the format [\"key-name-1\", \"key-name-2\"]. 
If you don't have an SSH key in your IBM Cloud account, you can create one by following the provided .[SSH Keys](https://cloud.ibm.com/docs/vpc?topic=vpc-ssh-keys)." -} - -variable "compute_ssh_keys" { - type = list(string) - description = "Provide the list of SSH key names configured in your IBM Cloud account to establish a connection to the Spectrum LSF cluster node. Ensure the SSH key is present in the same resource group and region where the cluster is being provisioned. To pass multiple SSH keys, use the format [\"key-name-1\", \"key-name-2\"]. If you do not have an SSH key in your IBM Cloud account, create one by following the provided instructions.[SSH Keys](https://cloud.ibm.com/docs/vpc?topic=vpc-ssh-keys).." -} - -variable "login_node_instance_type" { - type = string - default = "bx2-2x8" - description = "Specify the virtual server instance profile type to be used to create the login node for the IBM Spectrum LSF cluster. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." - validation { - condition = can(regex("^[^\\s]+-[0-9]+x[0-9]+", var.login_node_instance_type)) - error_message = "The profile must be a valid profile name." - } -} - -variable "management_image_name" { - type = string - default = "hpc-lsf10-rhel810-v2" - description = "Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud Spectrum LSF cluster management nodes. By default, the solution uses a RHEL810 base image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the lsf cluster through this offering." -} - -variable "compute_image_name" { - type = string - default = "hpcaas-lsf10-rhel810-compute-v8" - description = "Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud Spectrum LSF cluster compute (static/dynamic) nodes. By default, the solution uses a RHEL 8-10 base OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the lsf cluster through this offering." -} - -variable "login_image_name" { - type = string - default = "hpcaas-lsf10-rhel810-compute-v8" - description = "Name of the custom image that you want to use to create virtual server instances in your IBM Cloud account to deploy the IBM Cloud Spectrum LSF cluster login node. By default, the solution uses a RHEL 8-10 OS image with additional software packages mentioned [here](https://cloud.ibm.com/docs/ibm-spectrum-lsf#create-custom-image). If you would like to include your application-specific binary files, follow the instructions in [ Planning for custom images ](https://cloud.ibm.com/docs/vpc?topic=vpc-planning-custom-images) to create your own custom image and use that to build the lsf cluster through this offering." 
-} - -variable "management_node_instance_type" { - type = string - default = "bx2-16x64" - description = "Specify the virtual server instance profile type to be used to create the management nodes for the IBM Cloud LSF cluster. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." - validation { - condition = can(regex("^[^\\s]+-[0-9]+x[0-9]+", var.management_node_instance_type)) - error_message = "The profile must be a valid profile name." - } -} - -variable "management_node_count" { - type = number - default = 2 - description = "Specify the total number of management nodes, with a value between 1 and 10." - validation { - condition = 1 <= var.management_node_count && var.management_node_count <= 10 - error_message = "Input \"management_node_count\" must be must be greater than or equal to 1 and less than or equal to 10." - } -} - -variable "worker_node_instance_type" { - type = list(object({ - count = number - instance_type = string - })) - description = "The minimum number of worker nodes represents the static nodes provisioned during cluster creation. The solution supports different instance types, so specify the node count based on the requirements for each instance profile. For dynamic node provisioning, the automation will select the first profile from the list. Ensure sufficient account-level capacity if specifying a higher instance profile.. For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." - default = [ - { - count = 0 - instance_type = "bx2-4x16" - }, - { - count = 0 - instance_type = "cx2-8x16" - } - ] -} - -variable "worker_node_max_count" { - type = number - default = 10 - description = "The maximum number of worker nodes that can be deployed in the Spectrum LSF cluster. In order to use the [Resource Connector](https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=lsf-resource-connnector) feature to dynamically create and delete worker nodes based on workload demand, the value selected for this parameter must be larger than the total count of worker_node_instance_type. If you plan to deploy only static worker nodes in the LSF cluster." - validation { - condition = 1 <= var.worker_node_max_count && var.worker_node_max_count <= 500 - error_message = "Input \"worker_node_max_count must\" be >= 1 and <= 500." - } -} - -variable "custom_file_shares" { - type = list(object({ - mount_path = string, - size = optional(number), - iops = optional(number), - nfs_share = optional(string) - })) - default = [{ mount_path = "/mnt/vpcstorage/tools", size = 100, iops = 2000 }, { mount_path = "/mnt/vpcstorage/data", size = 100, iops = 6000 }, { mount_path = "/mnt/scale/tools", nfs_share = "" }] - description = "Provide details for customizing your shared file storage layout, including mount points, sizes (in GB), and IOPS ranges for up to five file shares if using VPC file storage as the storage option.If using IBM Storage Scale as an NFS mount, update the appropriate mount path and nfs_share values created from the Storage Scale cluster. Note that VPC file storage supports attachment to a maximum of 256 nodes. Exceeding this limit may result in mount point failures due to attachment restrictions.For more information, see [Storage options](https://test.cloud.ibm.com/docs/hpc-ibm-spectrumlsf?topic=hpc-ibm-spectrumlsf-integrating-scale#integrate-scale-and-hpc)." 
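The custom_file_shares structure described above accepts either VPC File Storage shares (mount_path with size and iops) or existing NFS exports such as Storage Scale filesets (mount_path with nfs_share). A hedged sketch based on the defaults, with a made-up NFS export address, could be:

    custom_file_shares = [
      { mount_path = "/mnt/vpcstorage/tools", size = 100, iops = 2000 },         # VPC file share; size must be 10-32000 GB
      { mount_path = "/mnt/scale/tools", nfs_share = "10.241.1.10:/gpfs/tools" }  # illustrative Scale/NFS export, address is hypothetical
    ]

Per the validations that follow, at most five VPC file shares are allowed and mount paths must be unique.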
- validation { - condition = length([for item in var.custom_file_shares : item if item.nfs_share == null]) <= 5 - error_message = "The VPC storage custom file share count \"custom_file_shares\" must be less than or equal to 5. Unlimited NFS mounts are allowed." - } - validation { - condition = length([for mounts in var.custom_file_shares : mounts.mount_path]) == length(toset([for mounts in var.custom_file_shares : mounts.mount_path])) - error_message = "Mount path values should not be duplicated." - } - validation { - condition = alltrue([for mounts in var.custom_file_shares : can(mounts.size) && mounts.size != null ? (10 <= mounts.size && mounts.size <= 32000) : true]) - error_message = "The custom_file_share size must be greater than or equal to 10 and less than or equal to 32000." - } -} - -variable "storage_security_group_id" { - type = string - default = null - description = "Provide the storage security group ID from the Spectrum Scale storage cluster if the mount_path in the cluster_file_share variable is set to use Scale fileset mount points. This security group is essential for establishing connections between the Spectrum LSF cluster nodes and NFS mount points, ensuring the nodes can access the specified mount points." -} - -############################################################################## -# DNS Template Variables -############################################################################## - -variable "dns_instance_id" { - type = string - default = null - description = "Provide the ID of an existing IBM Cloud DNS service instance to avoid creating a new one. Note: If dns_instance_id is not set to null, a new DNS zone will be created within the specified DNS service instance." -} - -variable "dns_domain_name" { - type = object({ - compute = string - }) - default = { - compute = "lsf.com" - } - description = "IBM Cloud DNS Services domain name to be used for the IBM Spectrum LSF cluster." - validation { - condition = can(regex("^([a-zA-Z0-9][a-zA-Z0-9-]{0,61}[a-zA-Z0-9])\\.com$", var.dns_domain_name.compute)) - error_message = "The domain name provided for compute is not a fully qualified domain name (FQDN). An FQDN can contain letters (a-z, A-Z), digits (0-9), hyphens (-), dots (.), and must start and end with an alphanumeric character." - } -} - -variable "dns_custom_resolver_id" { - type = string - default = null - description = "Provide the id of existing IBM Cloud DNS custom resolver to skip creating a new custom resolver. If the value is set to null, a new dns custom resolver shall be created and associated to the vpc. Note: A VPC can be associated only to a single custom resolver, please provide the id of custom resolver if it is already associated to the VPC." -} - -############################################################################## -# Observability Variables -############################################################################## - -variable "enable_cos_integration" { - type = bool - default = false - description = "Set to true to create an extra cos bucket to integrate with HPC cluster deployment." -} - -variable "cos_instance_name" { - type = string - default = null - description = "Provide the name of the existing COS instance where the logs for the enabled functionalities will be stored." -} - -variable "observability_atracker_enable" { - type = bool - default = true - description = "Configures Activity Tracker Event Routing to determine how audit events routed. 
While multiple Activity Tracker Event Routing can be created, only one is needed to capture all events. If an existing Activity Tracker is already integrated with a COS bucket or IBM Cloud Logs instance, set this value to false to avoid creating redundant trackers. All events can then be monitored and accessed through the existing tracker." -} - -variable "observability_atracker_target_type" { - type = string - default = "cloudlogs" - description = "Determines where all events can be stored based on the user input. Select the desired target type to retrieve or capture events into your system." - validation { - condition = contains(["cloudlogs", "cos"], var.observability_atracker_target_type) - error_message = "Allowed values for atracker target type is cloudlogs and cos." - } -} - -variable "cos_expiration_days" { - type = number - default = 30 - description = "Specify the retention period for objects in COS buckets by setting the number of days after their creation for automatic expiration. This configuration helps manage storage efficiently by removing outdated or unnecessary data, reducing storage costs, and maintaining data lifecycle policies. Ensure that the specified duration aligns with your data retention and compliance requirements." -} - -variable "enable_vpc_flow_logs" { - type = bool - default = true - description = "This flag determines whether VPC flow logs are enabled. When set to true, a flow log collector will be created to capture and monitor network traffic data within the VPC. Enabling flow logs provides valuable insights for troubleshooting, performance monitoring, and security auditing by recording information about the traffic passing through your VPC. Consider enabling this feature to enhance visibility and maintain robust network management practices." -} - -variable "vpn_enabled" { - type = bool - default = false - description = "Set the value as true to deploy a VPN gateway for VPC in the cluster." -} - -variable "observability_monitoring_enable" { - description = "Set this value as false to disable the IBM Cloud Monitoring integration. If enabled, infrastructure and LSF application metrics only from management nodes will be captured." - type = bool - default = true -} - -variable "observability_logs_enable_for_management" { - description = "Set this value as false to disable the IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from management nodes will be captured." - type = bool - default = false -} - -variable "observability_logs_enable_for_compute" { - description = "Set this value as false to disables the IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from compute nodes (static nodes or worker nodes) will be captured." - type = bool - default = false -} - -variable "observability_enable_platform_logs" { - description = "Setting this value as true creates a tenant in the same region in which the IBM® Cloud Logs instance is provisioned to enable platform logs for that region. NOTE: You can only have 1 tenant per region in an account." - type = bool - default = false -} - -variable "observability_enable_metrics_routing" { - description = "Enable the metrics routing to manage metrics at the account level by configuring targets and routes that define how the data points are routed." - type = bool - default = false -} - -variable "observability_logs_retention_period" { - description = "The number of days IBM Cloud Logs retains the logs data in priority insights. 
By default the value is set as 7, but the allowed values are 14, 30, 60, and 90." - type = number - default = 7 - validation { - condition = contains([7, 14, 30, 60, 90], var.observability_logs_retention_period) - error_message = "Allowed values for cloud logs retention period is 7, 14, 30, 60, 90." - } -} - -variable "observability_monitoring_on_compute_nodes_enable" { - description = "Set this value as false to disable IBM Cloud Monitoring integration. If enabled, infrastructure metrics from both static and dynamic compute nodes will be captured." - type = bool - default = false -} - -variable "observability_monitoring_plan" { - description = "This is a type of service plan for IBM Cloud Monitoring instance. You can choose one of the following: lite or graduated-tier. For all details visit [IBM Cloud Monitoring Service Plans](https://cloud.ibm.com/docs/monitoring?topic=monitoring-service_plans)." - type = string - default = "graduated-tier" - validation { - condition = can(regex("lite|graduated-tier", var.observability_monitoring_plan)) - error_message = "Please enter a valid plan for IBM Cloud Monitoring, for all details visit https://cloud.ibm.com/docs/monitoring?topic=monitoring-service_plans." - } -} - -############################################################################## -# Encryption Variables -############################################################################## - -variable "key_management" { - type = string - default = "key_protect" - description = "Set the value as key_protect to enable customer managed encryption for boot volume and file share. If the key_management is set as null, IBM Cloud resources will be always be encrypted through provider managed." - validation { - condition = var.key_management == "null" || var.key_management == null || var.key_management == "key_protect" - error_message = "key_management must be either 'null' or 'key_protect'." - } -} - -variable "kms_instance_name" { - type = string - default = null - description = "Provide the name of the existing Key Protect instance associated with the Key Management Service. Note: To use existing kms_instance_name set key_management as key_protect. The name can be found under the details of the KMS, see [View key-protect ID](https://cloud.ibm.com/docs/key-protect?topic=key-protect-retrieve-instance-ID&interface=ui)." -} - -variable "kms_key_name" { - type = string - default = null - description = "Provide the existing kms key name that you want to use for the IBM Spectrum LSF cluster. Note: kms_key_name to be considered only if key_management value is set as key_protect.(for example kms_key_name: my-encryption-key)." -} - -############################################################################## -# SCC Variables -############################################################################## - -variable "scc_enable" { - type = bool - default = true - description = "Flag to enable SCC instance creation. If true, an instance of SCC (Security and Compliance Center) will be created." 
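For the encryption settings defined here, customer-managed keys are only used when key_management is set to key_protect; otherwise resources fall back to provider-managed encryption. A sketch with placeholder names (the key name example comes from the kms_key_name description) might be:

    key_management    = "key_protect"
    kms_instance_name = "my-key-protect-instance"   # placeholder; must be an existing Key Protect instance
    kms_key_name      = "my-encryption-key"         # example name from the variable description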
-} - -variable "scc_profile" { - type = string - default = "CIS IBM Cloud Foundations Benchmark v1.1.0" - description = "Profile to be set on the SCC Instance (accepting empty, 'CIS IBM Cloud Foundations Benchmark v1.1.0' and 'IBM Cloud Framework for Financial Services')" - validation { - condition = can(regex("^(|CIS IBM Cloud Foundations Benchmark v1.1.0|IBM Cloud Framework for Financial Services)$", var.scc_profile)) - error_message = "Provide SCC Profile Name to be used (accepting empty, 'CIS IBM Cloud Foundations Benchmark' and 'IBM Cloud Framework for Financial Services')." - } -} - -variable "scc_location" { - description = "Location where the SCC instance is provisioned (possible choices 'us-south', 'eu-de', 'ca-tor', 'eu-es')" - type = string - default = "us-south" - validation { - condition = can(regex("^(|us-south|eu-de|ca-tor|eu-es)$", var.scc_location)) - error_message = "Provide region where it's possible to deploy an SCC Instance (possible choices 'us-south', 'eu-de', 'ca-tor', 'eu-es') or leave blank and it will default to 'us-south'." - } -} - -variable "scc_event_notification_plan" { - type = string - default = "lite" - description = "Event Notifications Instance plan to be used (it's used with S.C.C. instance), possible values 'lite' and 'standard'." - validation { - condition = can(regex("^(|lite|standard)$", var.scc_event_notification_plan)) - error_message = "Provide Event Notification instance plan to be used (accepting 'lite' and 'standard', defaulting to 'lite'). This instance is used in conjuction with S.C.C. one." - } -} - -############################################################################## -# Hyper-Threading in Compute Nodes -############################################################################## - -variable "hyperthreading_enabled" { - type = bool - default = true - description = "Enabling this setting (true by default) allows hyper-threading on the nodes of the cluster, improving overall processing efficiency by permitting each CPU core to execute multiple threads simultaneously. If set to false, hyperthreading will be disabled, which may be preferable for certain workloads requiring dedicated, non-threaded CPU resources for optimal performance. Carefully consider the nature of your computational tasks when configuring this option to achieve the best balance between performance and resource utilization." -} - -############################################################################## -# Encryption Variables -############################################################################## -variable "enable_app_center" { - type = bool - default = false - description = "Set to true to enable the IBM Spectrum LSF Application Center GUI (default: false). [System requirements](https://www.ibm.com/docs/en/slac/10.2.0?topic=requirements-system-102-fix-pack-14) for IBM Spectrum LSF Application Center Version 10.2 Fix Pack 14." -} - -variable "app_center_gui_pwd" { - type = string - sensitive = true - default = "" - description = "Password for IBM Spectrum LSF Application Center GUI. Note: Password should be at least 8 characters, must have one number, one lowercase letter, one uppercase letter, and at least one special character." -} - -variable "app_center_high_availability" { - type = bool - default = false - description = "Set to false to disable the IBM Spectrum LSF Application Center GUI High Availability (default: true). 
If the value is set as true, provide a certificate instance crn under existing_certificate_instance value for the VPC load balancer to enable HTTPS connections.For more information see [certificate instance requirements](https://cloud.ibm.com/docs/allowlist/hpc-service?topic=hpc-service-before-deploy-application-center)." -} - -variable "enable_fip" { - type = bool - default = true - description = "The solution supports multiple ways to connect to your IBM Spectrum LSF cluster. For example, using a login node, or using VPN or direct connection. If connecting to the lsf cluster using VPN or direct connection, set this value to false." -} - -############################################################################## -# ldap Variables -############################################################################## -variable "enable_ldap" { - type = bool - default = false - description = "Set this option to true to enable LDAP for IBM Spectrum LSF, with the default value set to false." -} - -variable "ldap_basedns" { - type = string - default = "lsf.com" - description = "The dns domain name is used for configuring the LDAP server. If an LDAP server is already in existence, ensure to provide the associated DNS domain name." -} - -variable "ldap_server" { - type = string - default = "null" - description = "Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created." -} - -variable "ldap_server_cert" { - type = string - sensitive = true - default = "null" - description = "Provide the existing LDAP server certificate. This value is required if the 'ldap_server' variable is not set to null. If the certificate is not provided or is invalid, the LDAP configuration may fail. For more information on how to create or obtain the certificate, please refer [existing LDAP server certificate](https://cloud.ibm.com/docs/allowlist/hpc-service?topic=hpc-service-integrating-openldap)." -} - -variable "ldap_admin_password" { - type = string - sensitive = true - default = "" - description = "The LDAP administrative password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@_+:) are required. It is important to avoid including the username in the password for enhanced security.[This value is ignored for an existing LDAP server]." -} - -variable "ldap_user_name" { - type = string - default = "" - description = "Custom LDAP User for performing cluster operations. Note: Username should be between 4 to 32 characters, (any combination of lowercase and uppercase letters).[This value is ignored for an existing LDAP server]" -} - -variable "ldap_user_password" { - type = string - sensitive = true - default = "" - description = "The LDAP user password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@_+:) are required.It is important to avoid including the username in the password for enhanced security.[This value is ignored for an existing LDAP server]." -} - -variable "ldap_vsi_profile" { - type = string - default = "cx2-2x4" - description = "Specify the virtual server instance profile type to be used to create the ldap node for the IBM Spectrum LSF cluster. 
For choices on profile types, see [Instance profiles](https://cloud.ibm.com/docs/vpc?topic=vpc-profiles)." -} - -variable "ldap_vsi_osimage_name" { - type = string - default = "ibm-ubuntu-22-04-4-minimal-amd64-3" - description = "Image name to be used for provisioning the LDAP instances. By default ldap server are created on Ubuntu based OS flavour." -} - -variable "skip_iam_block_storage_authorization_policy" { - type = bool - default = false - description = "When using an existing KMS instance name, set this value to true if authorization is already enabled between KMS instance and the block storage volume. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment.For more information on how to create authorization policy manually, see [creating authorization policies for block storage volume](https://cloud.ibm.com/docs/vpc?topic=vpc-block-s2s-auth&interface=ui)." -} - -variable "skip_iam_share_authorization_policy" { - type = bool - default = false - description = "When using an existing KMS instance name, set this value to true if authorization is already enabled between KMS instance and the VPC file share. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment.For more information on how to create authorization policy manually, see [creating authorization policies for VPC file share](https://cloud.ibm.com/docs/vpc?topic=vpc-file-s2s-auth&interface=ui)." -} - -variable "skip_flowlogs_s2s_auth_policy" { - type = bool - default = false - description = "When using an existing COS instance, set this value to true if authorization is already enabled between COS instance and the flow logs service. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment." -} - -########################################################################### -# IBM Cloud ALB Variables -########################################################################### -variable "app_center_existing_certificate_instance" { - description = "When app_center_high_availability is enable/set as true, The Application Center will be configured for high availability and requires a Application Load Balancer Front End listener to use a certificate CRN value stored in the Secret Manager. Provide the valid 'existing_certificate_instance' to configure the Application load balancer." - type = string - default = "" -} - -############################################################################## -# Environment Variables -############################################################################## - -# tflint-ignore: all -variable "TF_VERSION" { - type = string - default = "1.9" - description = "The version of the Terraform engine that's used in the Schematics workspace." -} - -# tflint-ignore: all -variable "TF_PARALLELISM" { - type = string - default = "250" - description = "Parallelism/ concurrent operations limit. Valid values are between 1 and 256, both inclusive. [Learn more](https://www.terraform.io/docs/internals/graph.html#walking-the-graph)." - validation { - condition = 1 <= var.TF_PARALLELISM && var.TF_PARALLELISM <= 256 - error_message = "Input \"TF_PARALLELISM\" must be greater than or equal to 1 and less than or equal to 256." - } -} - -# tflint-ignore: terraform_naming_convention -variable "TF_VALIDATION_SCRIPT_FILES" { - type = list(string) - default = [] - description = "List of script file names used by validation test suites. 
If provided, these scripts will be executed as part of validation test suites execution." - validation { - condition = alltrue([for filename in var.TF_VALIDATION_SCRIPT_FILES : can(regex(".*\\.sh$", filename))]) - error_message = "All validation script file names must end with .sh." - } -} -########################################################################### -# Existing Bastion Support variables -########################################################################### - -variable "existing_bastion_instance_name" { - type = string - default = null - description = "Provide the name of the bastion instance. If none given then new bastion will be created." -} - -variable "existing_bastion_instance_public_ip" { - type = string - default = null - description = "Provide the public ip address of the bastion instance to establish the remote connection." -} - -variable "existing_bastion_security_group_id" { - type = string - default = null - description = "Specify the security group ID for the bastion server. This ID will be added as an allowlist rule on the HPC cluster nodes to facilitate secure SSH connections through the bastion node. By restricting access through a bastion server, this setup enhances security by controlling and monitoring entry points into the cluster environment. Ensure that the specified security group is correctly configured to permit only authorized traffic for secure and efficient management of cluster resources." -} - -variable "existing_bastion_ssh_private_key" { - type = string - sensitive = true - default = null - description = "Provide the private SSH key (named id_rsa) used during the creation and configuration of the bastion server to securely authenticate and connect to the bastion server. This allows access to internal network resources from a secure entry point. Note: The corresponding public SSH key (named id_rsa.pub) must already be available in the ~/.ssh/authorized_keys file on the bastion host to establish authentication." -} - -########################################################################### -# Dedicated Host variables -########################################################################### - -variable "enable_dedicated_host" { - type = bool - default = false - description = "Set this option to true to enable dedicated hosts for the VSI created for workload servers. The default value is false. When a dedicated host is enabled, the solution supports only static worker nodes with a single profile, and multiple profile combinations are not supported. For example, you can select a profile from a single family, such as bx2, cx2, or mx2. If you are provisioning a static cluster with a third-generation profile, ensure that dedicated hosts are supported in the chosen regions, as not all regions support dedicated hosts for third-gen profiles. To learn more about dedicated host, [click here.](https://cloud.ibm.com/docs/vpc?topic=vpc-dh-profiles&interface=ui)" - validation { - condition = !(var.enable_dedicated_host && length(var.worker_node_instance_type) != 1) - error_message = "When 'enable_dedicated_host' is true, only one profile should be specified in 'worker_node_instance_type'." 
- } -} diff --git a/solutions/hpc/version.tf b/solutions/hpc/version.tf deleted file mode 100644 index 20309893..00000000 --- a/solutions/hpc/version.tf +++ /dev/null @@ -1,17 +0,0 @@ -terraform { - required_version = ">= 1.9.0" - required_providers { - ibm = { - source = "IBM-Cloud/ibm" - version = "1.77.0" - } - null = { - source = "hashicorp/null" - version = "3.2.3" - } - http = { - source = "hashicorp/http" - version = "3.4.5" - } - } -} diff --git a/solutions/hpcaas/README.md b/solutions/hpcaas/README.md new file mode 100644 index 00000000..aa48be02 --- /dev/null +++ b/solutions/hpcaas/README.md @@ -0,0 +1,79 @@ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [ibm](#requirement\_ibm) | >= 1.68.1, < 2.0.0 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [hpcaas](#module\_hpcaas) | ./../.. | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [allowed\_cidr](#input\_allowed\_cidr) | Network CIDR to access the VPC. This is used to manage network ACL rules for accessing the cluster. | `list(string)` | n/a | yes | +| [bastion\_ssh\_keys](#input\_bastion\_ssh\_keys) | The key pair to use to access the bastion host. | `list(string)` | `null` | no | +| [bastion\_subnets\_cidr](#input\_bastion\_subnets\_cidr) | Subnet CIDR block to launch the bastion host. | `string` | `"10.0.0.0/24"` | no | +| [client\_instances](#input\_client\_instances) | Number of instances to be launched for client. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 2,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [client\_ssh\_keys](#input\_client\_ssh\_keys) | The key pair to use to launch the client host. | `list(string)` | `null` | no | +| [client\_subnets\_cidr](#input\_client\_subnets\_cidr) | Subnet CIDR block to launch the client host. | `string` | `"10.10.10.0/24"` | no | +| [compute\_gui\_password](#input\_compute\_gui\_password) | Password for compute cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [compute\_gui\_username](#input\_compute\_gui\_username) | GUI user to perform system management and monitoring tasks on compute cluster. | `string` | `"admin"` | no | +| [compute\_ssh\_keys](#input\_compute\_ssh\_keys) | The key pair to use to launch the compute host. | `list(string)` | `null` | no | +| [compute\_subnets\_cidr](#input\_compute\_subnets\_cidr) | Subnet CIDR block to launch the compute cluster host. | `string` | `"10.10.20.0/24"` | no | +| [cos\_instance\_name](#input\_cos\_instance\_name) | Existing COS instance name | `string` | `null` | no | +| [deployer\_instance\_profile](#input\_deployer\_instance\_profile) | Deployer should only be used for better deployment performance | `string` | `"mx2-4x32"` | no | +| [dns\_custom\_resolver\_id](#input\_dns\_custom\_resolver\_id) | IBM Cloud DNS custom resolver id. | `string` | `null` | no | +| [dns\_domain\_names](#input\_dns\_domain\_names) | IBM Cloud HPC DNS domain names. |
object({
compute = string
storage = string
protocol = string
})
|
{
"compute": "comp.com",
"protocol": "ces.com",
"storage": "strg.com"
}
| no | +| [dns\_instance\_id](#input\_dns\_instance\_id) | IBM Cloud HPC DNS service instance id. | `string` | `null` | no | +| [dynamic\_compute\_instances](#input\_dynamic\_compute\_instances) | Maximum number of instances to be launched for the compute cluster. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 1024,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [enable\_atracker](#input\_enable\_atracker) | Enable Activity tracker | `bool` | `true` | no | +| [enable\_bastion](#input\_enable\_bastion) | The solution supports multiple ways to connect to your HPC cluster, for example using a bastion node, VPN, or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false. | `bool` | `true` | no | +| [enable\_cos\_integration](#input\_enable\_cos\_integration) | Integrate COS with HPC solution | `bool` | `true` | no | +| [enable\_deployer](#input\_enable\_deployer) | Deployer should only be used for better deployment performance | `bool` | `false` | no | +| [enable\_vpc\_flow\_logs](#input\_enable\_vpc\_flow\_logs) | Enable VPC flow logs | `bool` | `true` | no | +| [enable\_vpn](#input\_enable\_vpn) | The solution supports multiple ways to connect to your HPC cluster, for example using a bastion node, VPN, or direct connection. If connecting to the HPC cluster via VPN, set this value to true. | `bool` | `false` | no | +| [existing\_resource\_group](#input\_existing\_resource\_group) | String describing resource groups to create or reference | `string` | `"Default"` | no | +| [file\_shares](#input\_file\_shares) | Custom file shares to access shared storage |
list(
object({
mount_path = string,
size = number,
iops = number
})
)
|
[
{
"iops": 1000,
"mount_path": "/mnt/binaries",
"size": 100
},
{
"iops": 1000,
"mount_path": "/mnt/data",
"size": 100
}
]
| no | +| [hpcs\_instance\_name](#input\_hpcs\_instance\_name) | Hyper Protect Crypto Service instance | `string` | `null` | no | +| [ibm\_customer\_number](#input\_ibm\_customer\_number) | Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn). | `string` | n/a | yes | +| [ibmcloud\_api\_key](#input\_ibmcloud\_api\_key) | IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required. | `string` | n/a | yes | +| [key\_management](#input\_key\_management) | Set the value as key\_protect to enable customer managed encryption for boot volume and file share. If the key\_management is set as null, IBM Cloud resources will always be encrypted with provider-managed keys. | `string` | `"key_protect"` | no | +| [management\_instances](#input\_management\_instances) | Number of instances to be launched for management. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 2,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [network\_cidr](#input\_network\_cidr) | Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning. | `string` | `"10.0.0.0/8"` | no | +| [override](#input\_override) | Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment. | `bool` | `false` | no | +| [override\_json\_string](#input\_override\_json\_string) | Override default values with a JSON object. Any JSON other than an empty string overrides other configuration changes. | `string` | `null` | no | +| [placement\_strategy](#input\_placement\_strategy) | VPC placement groups to create (null / host\_spread / power\_spread) | `string` | `null` | no | +| [cluster_prefix](#input\_prefix) | A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters. | `string` | `"lsf"` | no | +| [protocol\_instances](#input\_protocol\_instances) | Number of instances to be launched for protocol hosts. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 2,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "bx2-2x8"
}
]
| no | +| [protocol\_subnets\_cidr](#input\_protocol\_subnets\_cidr) | Subnet CIDR block to launch the protocol hosts. | `string` | `"10.10.40.0/24"` | no | +| [ssh\_keys](#input\_ssh\_keys) | The key pair to use to access the HPC cluster. | `list(string)` | `null` | no | +| [static\_compute\_instances](#input\_static\_compute\_instances) | Minimum number of instances to be launched for the compute cluster. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 1,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [storage\_gui\_password](#input\_storage\_gui\_password) | Password for storage cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [storage\_gui\_username](#input\_storage\_gui\_username) | GUI user to perform system management and monitoring tasks on storage cluster. | `string` | `"admin"` | no | +| [storage\_instances](#input\_storage\_instances) | Number of instances to be launched for storage cluster. |
list(
object({
profile = string
count = number
image = string
filesystem_name = optional(string)
})
)
|
[
{
"count": 2,
"filesystem_name": "fs1",
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "bx2-2x8"
}
]
| no | +| [storage\_ssh\_keys](#input\_storage\_ssh\_keys) | The key pair to use to launch the storage cluster host. | `list(string)` | `null` | no | +| [storage\_subnets\_cidr](#input\_storage\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `string` | `"10.10.30.0/24"` | no | +| [vpc](#input\_vpc) | Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc) | `string` | `null` | no | +| [vpn\_peer\_address](#input\_vpn\_peer\_address) | The peer public IP address to which the VPN will be connected. | `string` | `null` | no | +| [vpn\_peer\_cidr](#input\_vpn\_peer\_cidr) | The peer CIDRs (e.g., 192.168.0.0/24) to which the VPN will be connected. | `list(string)` | `null` | no | +| [vpn\_preshared\_key](#input\_vpn\_preshared\_key) | The pre-shared key for the VPN. | `string` | `null` | no | +| [zone](#input\_zone) | Zone where VPC will be created. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [hpcaas](#output\_hpcaas) | HPCaaS details | diff --git a/solutions/hpcaas/catalogValidationValues.json.template b/solutions/hpcaas/catalogValidationValues.json.template new file mode 100644 index 00000000..bb5298d4 --- /dev/null +++ b/solutions/hpcaas/catalogValidationValues.json.template @@ -0,0 +1,7 @@ +{ + "ibmcloud_api_key": $VALIDATION_APIKEY, + "cluster_prefix": $PREFIX, + "zones": "[\"ca-tor-1\"]", + "existing_resource_group": "geretain-hpc-rg", + "ssh_keys": "[\"geretain-hpc-ssh-key\"]" +} diff --git a/solutions/hpcaas/locals.tf b/solutions/hpcaas/locals.tf new file mode 100644 index 00000000..0d529799 --- /dev/null +++ b/solutions/hpcaas/locals.tf @@ -0,0 +1,96 @@ +# locals needed for ibm provider +locals { + # Region and Zone calculations + region = join("-", slice(split("-", var.zones[0]), 0, 2)) +} + +locals { + override_json_path = abspath("./override.json") + override = { + override = jsondecode(var.override && var.override_json_string == null ? + (local.override_json_path == "" ? file("${path.root}/override.json") : file(local.override_json_path)) + : + "{}") + override_json_string = jsondecode(var.override_json_string == null ? "{}" : var.override_json_string) + } + override_type = var.override_json_string == null ? 
"override" : "override_json_string" +} + +locals { + config = { + existing_resource_group = var.existing_resource_group + remote_allowed_ips = var.remote_allowed_ips + deployer_instance = var.deployer_instance + ssh_keys = var.ssh_keys + vpc_cluster_login_private_subnets_cidr_blocks = var.vpc_cluster_login_private_subnets_cidr_blocks + compute_gui_password = var.compute_gui_password + compute_gui_username = var.compute_gui_username + vpc_cluster_private_subnets_cidr_blocks = var.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = var.cos_instance_name + dns_custom_resolver_id = var.dns_custom_resolver_id + dns_instance_id = var.dns_instance_id + dns_domain_names = var.dns_domain_names + dynamic_compute_instances = var.dynamic_compute_instances + enable_atracker = var.enable_atracker + # enable_bastion = var.enable_bastion + enable_cos_integration = var.enable_cos_integration + enable_vpc_flow_logs = var.enable_vpc_flow_logs + custom_file_shares = var.custom_file_shares + hpcs_instance_name = var.hpcs_instance_name + key_management = var.key_management + client_instances = var.client_instances + client_subnets_cidr = var.client_subnets_cidr + management_instances = var.management_instances + vpc_cidr = var.vpc_cidr + placement_strategy = var.placement_strategy + cluster_prefix = var.cluster_prefix + protocol_instances = var.protocol_instances + protocol_subnets_cidr = var.protocol_subnets_cidr + static_compute_instances = var.static_compute_instances + storage_gui_password = var.storage_gui_password + storage_gui_username = var.storage_gui_username + storage_instances = var.storage_instances + storage_subnets_cidr = var.storage_subnets_cidr + vpc_name = var.vpc_name + } +} + +# Compile Environment for Config output +locals { + env = { + existing_resource_group = lookup(local.override[local.override_type], "existing_resource_group", local.config.existing_resource_group) + remote_allowed_ips = lookup(local.override[local.override_type], "remote_allowed_ips", local.config.remote_allowed_ips) + deployer_instance = lookup(local.override[local.override_type], "deployer_instance", local.config.deployer_instance) + ssh_keys = lookup(local.override[local.override_type], "ssh_keys", local.config.ssh_keys) + vpc_cluster_login_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_login_private_subnets_cidr_blocks", local.config.vpc_cluster_login_private_subnets_cidr_blocks) + compute_gui_password = lookup(local.override[local.override_type], "compute_gui_password", local.config.compute_gui_password) + compute_gui_username = lookup(local.override[local.override_type], "compute_gui_username", local.config.compute_gui_username) + vpc_cluster_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_private_subnets_cidr_blocks", local.config.vpc_cluster_private_subnets_cidr_blocks) + cos_instance_name = lookup(local.override[local.override_type], "cos_instance_name", local.config.cos_instance_name) + dns_custom_resolver_id = lookup(local.override[local.override_type], "dns_custom_resolver_id", local.config.dns_custom_resolver_id) + dns_instance_id = lookup(local.override[local.override_type], "dns_instance_id", local.config.dns_instance_id) + dns_domain_names = lookup(local.override[local.override_type], "dns_domain_names", local.config.dns_domain_names) + dynamic_compute_instances = lookup(local.override[local.override_type], "dynamic_compute_instances", local.config.dynamic_compute_instances) + enable_atracker = 
lookup(local.override[local.override_type], "enable_atracker", local.config.enable_atracker) + # enable_bastion = lookup(local.override[local.override_type], "enable_bastion", local.config.enable_bastion) + enable_cos_integration = lookup(local.override[local.override_type], "enable_cos_integration", local.config.enable_cos_integration) + enable_vpc_flow_logs = lookup(local.override[local.override_type], "enable_vpc_flow_logs", local.config.enable_vpc_flow_logs) + custom_file_shares = lookup(local.override[local.override_type], "custom_file_shares", local.config.custom_file_shares) + hpcs_instance_name = lookup(local.override[local.override_type], "hpcs_instance_name", local.config.hpcs_instance_name) + key_management = lookup(local.override[local.override_type], "key_management", local.config.key_management) + client_instances = lookup(local.override[local.override_type], "client_instances", local.config.client_instances) + client_subnets_cidr = lookup(local.override[local.override_type], "client_subnets_cidr", local.config.client_subnets_cidr) + management_instances = lookup(local.override[local.override_type], "management_instances", local.config.management_instances) + vpc_cidr = lookup(local.override[local.override_type], "vpc_cidr", local.config.vpc_cidr) + placement_strategy = lookup(local.override[local.override_type], "placement_strategy", local.config.placement_strategy) + cluster_prefix = lookup(local.override[local.override_type], "cluster_prefix", local.config.cluster_prefix) + protocol_instances = lookup(local.override[local.override_type], "protocol_instances", local.config.protocol_instances) + protocol_subnets_cidr = lookup(local.override[local.override_type], "protocol_subnets_cidr", local.config.protocol_subnets_cidr) + static_compute_instances = lookup(local.override[local.override_type], "static_compute_instances", local.config.static_compute_instances) + storage_gui_password = lookup(local.override[local.override_type], "storage_gui_password", local.config.storage_gui_password) + storage_gui_username = lookup(local.override[local.override_type], "storage_gui_username", local.config.storage_gui_username) + storage_instances = lookup(local.override[local.override_type], "storage_instances", local.config.storage_instances) + storage_subnets_cidr = lookup(local.override[local.override_type], "storage_subnets_cidr", local.config.storage_subnets_cidr) + vpc_name = lookup(local.override[local.override_type], "vpc_name", local.config.vpc_name) + } +} diff --git a/solutions/hpcaas/main.tf b/solutions/hpcaas/main.tf new file mode 100644 index 00000000..86e57581 --- /dev/null +++ b/solutions/hpcaas/main.tf @@ -0,0 +1,41 @@ +module "hpcaas" { + source = "./../.." 
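The `env` locals above resolve every setting with `lookup(local.override[local.override_type], <key>, <default>)`, so values from `override.json` (or the inline `override_json_string`) win over the variable defaults. A condensed, hedged sketch of that pattern, reduced to two illustrative keys (the real locals carry the full configuration map):

```hcl
# Minimal sketch of the override pattern used in the solution's locals.tf.
# Assumes an override.json file sits next to the configuration, as it does here.
variable "override" {
  type    = bool
  default = false
}

variable "override_json_string" {
  type    = string
  default = null
}

locals {
  # Defaults that would normally come from the solution's input variables.
  config = {
    cluster_prefix = "lsf"
    vpc_cidr       = "10.241.0.0/18"
  }

  # Decode override.json or the inline JSON string, whichever applies.
  override = {
    override             = jsondecode(var.override && var.override_json_string == null ? file("${path.root}/override.json") : "{}")
    override_json_string = jsondecode(var.override_json_string == null ? "{}" : var.override_json_string)
  }
  override_type = var.override_json_string == null ? "override" : "override_json_string"

  # Each key falls back to the default unless the selected override supplies it.
  env = {
    cluster_prefix = lookup(local.override[local.override_type], "cluster_prefix", local.config.cluster_prefix)
    vpc_cidr       = lookup(local.override[local.override_type], "vpc_cidr", local.config.vpc_cidr)
  }
}
```

With this arrangement, `terraform plan -var override=true` picks up `override.json`, while passing `override_json_string` takes precedence over both the defaults and the file.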
+ scheduler = "HPCaaS" + ibm_customer_number = var.ibm_customer_number + zones = var.zones + remote_allowed_ips = var.remote_allowed_ips + cluster_prefix = local.env.cluster_prefix + ssh_keys = local.env.ssh_keys + existing_resource_group = local.env.existing_resource_group + deployer_instance = local.env.deployer_instance + vpc_cluster_login_private_subnets_cidr_blocks = local.env.vpc_cluster_login_private_subnets_cidr_blocks + vpc_cluster_private_subnets_cidr_blocks = local.env.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = local.env.cos_instance_name + dns_custom_resolver_id = local.env.dns_custom_resolver_id + dns_instance_id = local.env.dns_instance_id + dns_domain_names = local.env.dns_domain_names + dynamic_compute_instances = local.env.dynamic_compute_instances + enable_atracker = local.env.enable_atracker + # enable_bastion = local.env.enable_bastion + enable_cos_integration = local.env.enable_cos_integration + enable_vpc_flow_logs = local.env.enable_vpc_flow_logs + custom_file_shares = local.env.custom_file_shares + key_management = local.env.key_management + client_instances = local.env.client_instances + management_instances = local.env.management_instances + vpc_cidr = local.env.vpc_cidr + placement_strategy = local.env.placement_strategy + protocol_instances = local.env.protocol_instances + protocol_subnets_cidr = [local.env.protocol_subnets_cidr] + static_compute_instances = local.env.static_compute_instances + storage_instances = local.env.storage_instances + storage_subnets_cidr = [local.env.storage_subnets_cidr] + vpc_name = local.env.vpc_name + + # compute_gui_password = local.env.compute_gui_password + # compute_gui_username = local.env.compute_gui_username + # hpcs_instance_name = local.env.hpcs_instance_name + # client_subnets_cidr = [local.env.client_subnets_cidr] + # storage_gui_password = local.env.storage_gui_password + # storage_gui_username = local.env.storage_gui_username +} diff --git a/solutions/hpcaas/outputs.tf b/solutions/hpcaas/outputs.tf new file mode 100644 index 00000000..54fa4c3d --- /dev/null +++ b/solutions/hpcaas/outputs.tf @@ -0,0 +1,4 @@ +output "hpcaas" { + description = "HPCaaS details" + value = module.hpcaas +} diff --git a/solutions/hpcaas/override.json b/solutions/hpcaas/override.json new file mode 100644 index 00000000..560c5a8b --- /dev/null +++ b/solutions/hpcaas/override.json @@ -0,0 +1,85 @@ +{ + "cluster_prefix": "hpcaas", + "existing_resource_group": "Default", + "vpc_name": null, + "vpc_cidr": "10.0.0.0/8", + "placement_strategy": null, + "ssh_keys": null, + "enable_bastion": true, + "enable_deployer": false, + "deployer_instance_profile": "mx2-4x32", + "vpc_cluster_login_private_subnets_cidr_blocks": "10.0.0.0/24", + "client_subnets_cidr": "10.10.10.0/24", + "client_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "vpc_cluster_private_subnets_cidr_blocks": "10.10.20.0/24", + "management_instances": [ + { + "profile": "cx2-2x4", + "count": 3, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "static_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 0, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "dynamic_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 5000, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "compute_gui_username": "admin", + "storage_subnets_cidr": "10.10.30.0/24", + "storage_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + 
"protocol_subnets_cidr": "10.10.40.0/24", + "protocol_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "storage_gui_username": "admin", + "custom_file_shares": [ + { + "mount_path": "/mnt/binaries", + "size": 100, + "iops": 1000 + }, + { + "mount_path": "/mnt/data", + "size": 100, + "iops": 1000 + } + ], + "dns_instance_id": null, + "dns_custom_resolver_id": null, + "dns_domain_names": { + "compute": "comp.com", + "storage": "strg.com", + "protocol": "ces.com" + }, + "enable_cos_integration": true, + "cos_instance_name": null, + "enable_atracker": true, + "enable_vpc_flow_logs": true, + "key_management": "key_protect", + "hpcs_instance_name": null +} diff --git a/solutions/hpcaas/variables.tf b/solutions/hpcaas/variables.tf new file mode 100644 index 00000000..eededab6 --- /dev/null +++ b/solutions/hpcaas/variables.tf @@ -0,0 +1,405 @@ +############################################################################## +# Offering Variations +############################################################################## +variable "ibm_customer_number" { + type = string + sensitive = true + description = "Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn)." + validation { + condition = can(regex("^[0-9A-Za-z]*([0-9A-Za-z]+,[0-9A-Za-z]+)*$", var.ibm_customer_number)) + error_message = "The IBM customer number input value cannot have special characters." + } +} + +############################################################################## +# Account Variables +############################################################################## +variable "ibmcloud_api_key" { + type = string + sensitive = true + description = "IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required." +} + +############################################################################## +# Cluster Level Variables +############################################################################## +variable "zones" { + description = "Specify the IBM Cloud zone within the chosen region where the IBM Spectrum LSF cluster will be deployed. A single zone input is required, and the management nodes, file storage shares, and compute nodes will all be provisioned in this zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli)." + type = list(string) + default = ["us-east-1"] + validation { + condition = length(var.zones) == 1 + error_message = "HPC product deployment supports only a single zone. Provide a value for a single zone from the supported regions: eu-de-2 or eu-de-3 for eu-de, us-east-1 or us-east-3 for us-east, and us-south-1 for us-south." + } +} + +variable "ssh_keys" { + type = list(string) + default = null + description = "The key pair to use to access the HPC cluster." +} + +variable "remote_allowed_ips" { + type = list(string) + description = "Comma-separated list of IP addresses that can access the IBM Spectrum LSF cluster instance through an SSH interface. For security purposes, provide the public IP addresses assigned to the devices that are authorized to establish SSH connections (for example, [\"169.45.117.34\"]). 
To fetch the IP address of the device, use [https://ipv4.icanhazip.com/](https://ipv4.icanhazip.com/)." + validation { + condition = alltrue([ + for o in var.remote_allowed_ips : !contains(["0.0.0.0/0", "0.0.0.0"], o) + ]) + error_message = "For security, provide the public IP addresses assigned to the devices authorized to establish SSH connections. Use https://ipv4.icanhazip.com/ to fetch the ip address of the device." + } + validation { + condition = alltrue([ + for a in var.remote_allowed_ips : can(regex("^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(/(3[0-2]|2[0-9]|1[0-9]|[0-9]))?$", a)) + ]) + error_message = "The provided IP address format is not valid. Check if the IP address contains a comma instead of a dot, and ensure there are double quotation marks between each IP address range if using multiple IP ranges. For multiple IP address, use the format [\"169.45.117.34\",\"128.122.144.145\"]." + } +} + +variable "cluster_prefix" { + type = string + default = "lsf" + description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." + validation { + error_message = "Prefix must begin and end with a letter and contain only letters, numbers, and - characters." + condition = can(regex("^([A-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.cluster_prefix)) + } + validation { + condition = length(var.cluster_prefix) <= 16 + error_message = "The cluster_prefix must be 16 characters or fewer." + } +} + +############################################################################## +# Resource Groups Variables +############################################################################## +variable "existing_resource_group" { + type = string + default = "Default" + description = "String describing resource groups to create or reference" + +} + +############################################################################## +# VPC Variables +############################################################################## +variable "vpc_name" { + type = string + default = null + description = "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc_name)" +} + +variable "vpc_cidr" { + type = string + default = "10.241.0.0/18" + description = "Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning." +} + +variable "placement_strategy" { + type = string + default = null + description = "VPC placement groups to create (null / host_spread / power_spread)" +} + +############################################################################## +# Access Variables +############################################################################## +variable "deployer_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "hpc-lsf-fp15-deployer-rhel810-v1" + profile = "bx2-8x32" + } + description = "Configuration for the deployer node, including the custom image and instance profile. By default, uses fixpack_15 image and a bx2-8x32 profile." 
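For orientation, a minimal `terraform.tfvars` sketch covering the cluster-level inputs declared above; every value is a placeholder rather than a recommendation, and the image/profile strings simply repeat the declared defaults:

```hcl
# Hypothetical terraform.tfvars; replace all values with real account details.
zones                   = ["us-east-1"]      # exactly one zone, per the validation above
remote_allowed_ips      = ["203.0.113.10"]   # public IP allowed to reach the cluster over SSH
ssh_keys                = ["my-vpc-ssh-key"] # name of an existing VPC SSH key (placeholder)
cluster_prefix          = "lsf-demo"
existing_resource_group = "Default"

deployer_instance = {
  image   = "hpc-lsf-fp15-deployer-rhel810-v1"
  profile = "bx2-8x32"
}
```

The required sensitive inputs (`ibmcloud_api_key`, `ibm_customer_number`) are usually better supplied through `TF_VAR_` environment variables than committed to a tfvars file.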
+} + +# variable "enable_bastion" { +# type = bool +# default = true +# description = "The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false." +# } + +variable "vpc_cluster_login_private_subnets_cidr_blocks" { + type = string + default = "10.241.16.0/28" + description = "Provide the CIDR block required for the creation of the login cluster's private subnet. Only one CIDR block is needed. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Since the login subnet is used only for the creation of login virtual server instances, provide a CIDR range of /28." + validation { + condition = tonumber(regex("^.*?/(\\d+)$", var.vpc_cluster_login_private_subnets_cidr_blocks)[0]) <= 28 + error_message = "This subnet is used to create only a login virtual server instance. Providing a larger CIDR size will waste the usage of available IPs. A CIDR range of /28 is sufficient for the creation of the login subnet." + } +} + +############################################################################## +# Compute Variables +############################################################################## +variable "client_subnets_cidr" { + type = string + default = "10.241.50.0/24" + description = "Subnet CIDR block to launch the client host." +} + +variable "client_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for client." +} + +variable "vpc_cluster_private_subnets_cidr_blocks" { + type = string + default = "10.241.0.0/20" + description = "Provide the CIDR block required for the creation of the compute cluster's private subnet. One CIDR block is required. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Ensure the selected CIDR block size can accommodate the maximum number of management and dynamic compute nodes expected in your cluster. For more information on CIDR block size selection, refer to the documentation, see [Choosing IP ranges for your VPC](https://cloud.ibm.com/docs/vpc?topic=vpc-choosing-ip-ranges-for-your-vpc)." +} + +variable "management_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for management." +} + +variable "static_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Min Number of instances to be launched for compute cluster." +} + +variable "dynamic_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 1024 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "MaxNumber of instances to be launched for compute cluster." +} + +variable "compute_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on compute cluster." 
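The management and compute pools declared above all share the same `list(object({ profile, count, image }))` shape. A hedged sketch of how a small cluster might be sized; the counts are arbitrary and the profile/image strings repeat each variable's own default:

```hcl
# Illustrative sizing only.
management_instances = [{
  profile = "cx2-2x4"
  count   = 2
  image   = "ibm-redhat-8-10-minimal-amd64-2"
}]

static_compute_instances = [{
  profile = "cx2-2x4"
  count   = 4   # always-on worker nodes
  image   = "ibm-redhat-8-10-minimal-amd64-4"
}]

dynamic_compute_instances = [{
  profile = "cx2-2x4"
  count   = 256 # upper bound for nodes created on demand
  image   = "ibm-redhat-8-10-minimal-amd64-2"
}]
```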
+} + +variable "compute_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for compute cluster GUI" +} + +############################################################################## +# Storage Scale Variables +############################################################################## +variable "storage_subnets_cidr" { + type = string + default = "10.241.30.0/24" + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "storage_instances" { + type = list( + object({ + profile = string + count = number + image = string + filesystem = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem = "/ibm/fs1" + }] + description = "Number of instances to be launched for storage cluster." +} + +variable "protocol_subnets_cidr" { + type = string + default = "10.241.40.0/24" + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "protocol_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for protocol hosts." +} + +variable "storage_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on storage cluster." +} + +variable "storage_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for storage cluster GUI" +} + +variable "custom_file_shares" { + type = list( + object({ + mount_path = string, + size = number, + iops = number + }) + ) + default = [{ + mount_path = "/mnt/binaries" + size = 100 + iops = 1000 + }, { + mount_path = "/mnt/data" + size = 100 + iops = 1000 + }] + description = "Custom file shares to access shared storage" +} + +############################################################################## +# DNS Variables +############################################################################## + +variable "dns_instance_id" { + type = string + default = null + description = "IBM Cloud HPC DNS service instance id." +} + +variable "dns_custom_resolver_id" { + type = string + default = null + description = "IBM Cloud DNS custom resolver id." +} + +variable "dns_domain_names" { + type = object({ + compute = string + storage = string + protocol = string + client = string + gklm = string + }) + default = { + compute = "comp.com" + storage = "strg.com" + protocol = "ces.com" + client = "clnt.com" + gklm = "gklm.com" + } + description = "IBM Cloud HPC DNS domain names." +} + +############################################################################## +# Encryption Variables +############################################################################## +variable "key_management" { + type = string + default = "key_protect" + description = "Set the value as key_protect to enable customer managed encryption for boot volume and file share. If the key_management is set as null, IBM Cloud resources will be always be encrypted through provider managed." + validation { + condition = var.key_management == "null" || var.key_management == null || var.key_management == "key_protect" + error_message = "key_management must be either 'null' or 'key_protect'." 
+ } +} + +variable "hpcs_instance_name" { + type = string + default = null + description = "Hyper Protect Crypto Service instance" +} + +############################################################################## +# Observability Variables +############################################################################## +variable "enable_cos_integration" { + type = bool + default = true + description = "Integrate COS with HPC solution" +} + +variable "cos_instance_name" { + type = string + default = null + description = "Exiting COS instance name" +} + +variable "enable_atracker" { + type = bool + default = true + description = "Enable Activity tracker" +} + +variable "enable_vpc_flow_logs" { + type = bool + default = true + description = "Enable Activity tracker" +} + +############################################################################## +# Override JSON +############################################################################## +variable "override" { + type = bool + default = false + description = "Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment." + +} + +variable "override_json_string" { + type = string + default = null + description = "Override default values with a JSON object. Any JSON other than an empty string overrides other configuration changes." +} diff --git a/solutions/hpcaas/version.tf b/solutions/hpcaas/version.tf new file mode 100644 index 00000000..93f82bed --- /dev/null +++ b/solutions/hpcaas/version.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.9.0" + required_providers { + ibm = { + source = "IBM-Cloud/ibm" + version = ">= 1.68.1, < 2.0.0" + } + } +} + +provider "ibm" { + ibmcloud_api_key = var.ibmcloud_api_key + region = local.region +} diff --git a/solutions/lsf/README.md b/solutions/lsf/README.md new file mode 100644 index 00000000..eb1c4dab --- /dev/null +++ b/solutions/lsf/README.md @@ -0,0 +1,101 @@ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +| [ibm](#requirement\_ibm) | >= 1.68.1, < 2.0.0 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [lsf](#module\_lsf) | ./../.. | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [allowed\_cidr](#input\_allowed\_cidr) | Network CIDR to access the VPC. This is used to manage network ACL rules for accessing the cluster. | `list(string)` | n/a | yes | +| [bastion\_image](#input\_bastion\_image) | The image to use to deploy the bastion host. | `string` | `"ibm-ubuntu-22-04-3-minimal-amd64-1"` | no | +| [bastion\_instance\_profile](#input\_bastion\_instance\_profile) | Deployer should be only used for better deployment performance | `string` | `"cx2-4x8"` | no | +| [bastion\_ssh\_keys](#input\_bastion\_ssh\_keys) | The key pair to use to access the bastion host. | `list(string)` | `null` | no | +| [bastion\_subnets\_cidr](#input\_bastion\_subnets\_cidr) | Subnet CIDR block to launch the bastion host. | `string` | `"10.0.0.0/24"` | no | +| [client\_instances](#input\_client\_instances) | Number of instances to be launched for client. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 2,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [client\_ssh\_keys](#input\_client\_ssh\_keys) | The key pair to use to launch the client host. | `list(string)` | `null` | no | +| [client\_subnets\_cidr](#input\_client\_subnets\_cidr) | Subnet CIDR block to launch the client host. | `string` | `"10.10.10.0/24"` | no | +| [compute\_gui\_password](#input\_compute\_gui\_password) | Password for compute cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [compute\_gui\_username](#input\_compute\_gui\_username) | GUI user to perform system management and monitoring tasks on compute cluster. | `string` | `"admin"` | no | +| [compute\_ssh\_keys](#input\_compute\_ssh\_keys) | The key pair to use to launch the compute host. | `list(string)` | `null` | no | +| [compute\_subnets\_cidr](#input\_compute\_subnets\_cidr) | Subnet CIDR block to launch the compute cluster host. | `string` | `"10.10.20.0/24"` | no | +| [cos\_instance\_name](#input\_cos\_instance\_name) | Existing COS instance name | `string` | `null` | no | +| [deployer\_image](#input\_deployer\_image) | The image to use to deploy the deployer host. | `string` | `"jay-lsf-new-image"` | no | +| [deployer\_instance\_profile](#input\_deployer\_instance\_profile) | Deployer should only be used for better deployment performance | `string` | `"bx2-8x32"` | no | +| [dns\_custom\_resolver\_id](#input\_dns\_custom\_resolver\_id) | IBM Cloud DNS custom resolver id. | `string` | `null` | no | +| [dns\_domain\_names](#input\_dns\_domain\_names) | IBM Cloud HPC DNS domain names. |
object({
compute = string
storage = string
protocol = string
})
|
{
"compute": "comp.com",
"protocol": "ces.com",
"storage": "strg.com"
}
| no | +| [dns\_instance\_id](#input\_dns\_instance\_id) | IBM Cloud HPC DNS service instance id. | `string` | `null` | no | +| [dynamic\_compute\_instances](#input\_dynamic\_compute\_instances) | Maximum number of instances to be launched for the compute cluster. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 1024,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [enable\_bastion](#input\_enable\_bastion) | The solution supports multiple ways to connect to your HPC cluster, for example using a bastion node, VPN, or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false. | `bool` | `true` | no | +| [enable\_cos\_integration](#input\_enable\_cos\_integration) | Integrate COS with HPC solution | `bool` | `true` | no | +| [enable\_deployer](#input\_enable\_deployer) | Deployer should only be used for better deployment performance | `bool` | `false` | no | +| [enable\_hyperthreading](#input\_enable\_hyperthreading) | Setting this to true will enable hyper-threading in the worker nodes of the cluster (default). Otherwise, hyper-threading will be disabled. | `bool` | `true` | no | +| [enable\_vpc\_flow\_logs](#input\_enable\_vpc\_flow\_logs) | Enable VPC flow logs | `bool` | `true` | no | +| [enable\_vpn](#input\_enable\_vpn) | The solution supports multiple ways to connect to your HPC cluster, for example using a bastion node, VPN, or direct connection. If connecting to the HPC cluster via VPN, set this value to true. | `bool` | `false` | no | +| [existing\_resource\_group](#input\_existing\_resource\_group) | String describing resource groups to create or reference | `string` | `"Default"` | no | +| [file\_shares](#input\_file\_shares) | Custom file shares to access shared storage |
list(
object({
mount_path = string,
size = number,
iops = number
})
)
|
[
{
"iops": 1000,
"mount_path": "/mnt/binaries",
"size": 100
},
{
"iops": 1000,
"mount_path": "/mnt/data",
"size": 100
}
]
| no | +| [hpcs\_instance\_name](#input\_hpcs\_instance\_name) | Hyper Protect Crypto Service instance | `string` | `null` | no | +| [ibm\_customer\_number](#input\_ibm\_customer\_number) | Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn). | `string` | n/a | yes | +| [ibmcloud\_api\_key](#input\_ibmcloud\_api\_key) | IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required. | `string` | n/a | yes | +| [key\_management](#input\_key\_management) | Set the value as key\_protect to enable customer managed encryption for boot volume and file share. If the key\_management is set as null, IBM Cloud resources will always be encrypted with provider-managed keys. | `string` | `"key_protect"` | no | +| [kms\_instance\_name](#input\_kms\_instance\_name) | Provide the name of the existing Key Protect instance associated with the Key Management Service. Note: To use an existing kms\_instance\_name, set key\_management as key\_protect. The name can be found under the details of the KMS, see [View key-protect ID](https://cloud.ibm.com/docs/key-protect?topic=key-protect-retrieve-instance-ID&interface=ui). | `string` | `null` | no | +| [kms\_key\_name](#input\_kms\_key\_name) | Provide the existing kms key name that you want to use for the IBM Cloud HPC cluster. Note: kms\_key\_name is considered only if the key\_management value is set as key\_protect (for example, kms\_key\_name: my-encryption-key). | `string` | `null` | no | +| [management\_instances](#input\_management\_instances) | Number of instances to be launched for management. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 2,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [network\_cidr](#input\_network\_cidr) | Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning. | `string` | `"10.0.0.0/8"` | no | +| [observability\_atracker\_enable](#input\_observability\_atracker\_enable) | Activity Tracker Event Routing to configure how to route auditing events. While multiple Activity Tracker instances can be created, only one tracker is needed to capture all events. Creating additional trackers is unnecessary if an existing Activity Tracker is already integrated with a COS bucket. In such cases, set the value to false, as all events can be monitored and accessed through the existing Activity Tracker. | `bool` | `true` | no | +| [observability\_atracker\_target\_type](#input\_observability\_atracker\_target\_type) | All the events will be stored in either COS bucket or Cloud Logs on the basis of user input, so customers can retrieve or ingest them in their system. | `string` | `"cloudlogs"` | no | +| [observability\_enable\_metrics\_routing](#input\_observability\_enable\_metrics\_routing) | Enable metrics routing to manage metrics at the account-level by configuring targets and routes that define where data points are routed. | `bool` | `false` | no | +| [observability\_enable\_platform\_logs](#input\_observability\_enable\_platform\_logs) | Setting this to true will create a tenant in the same region that the Cloud Logs instance is provisioned to enable platform logs for that region. NOTE: You can only have 1 tenant per region in an account. | `bool` | `false` | no | +| [observability\_logs\_enable\_for\_compute](#input\_observability\_logs\_enable\_for\_compute) | Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Compute Nodes will be ingested. | `bool` | `false` | no | +| [observability\_logs\_enable\_for\_management](#input\_observability\_logs\_enable\_for\_management) | Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Management Nodes will be ingested. | `bool` | `false` | no | +| [observability\_logs\_retention\_period](#input\_observability\_logs\_retention\_period) | The number of days IBM Cloud Logs will retain the logs data in Priority insights. Allowed values: 7, 14, 30, 60, 90. | `number` | `7` | no | +| [observability\_monitoring\_enable](#input\_observability\_monitoring\_enable) | Set false to disable IBM Cloud Monitoring integration. If enabled, infrastructure and LSF application metrics from Management Nodes will be ingested. | `bool` | `true` | no | +| [observability\_monitoring\_on\_compute\_nodes\_enable](#input\_observability\_monitoring\_on\_compute\_nodes\_enable) | Set false to disable IBM Cloud Monitoring integration. If enabled, infrastructure metrics from Compute Nodes will be ingested. | `bool` | `false` | no | +| [observability\_monitoring\_plan](#input\_observability\_monitoring\_plan) | Type of service plan for IBM Cloud Monitoring instance. You can choose one of the following: lite, graduated-tier. For all details visit [IBM Cloud Monitoring Service Plans](https://cloud.ibm.com/docs/monitoring?topic=monitoring-service_plans). | `string` | `"graduated-tier"` | no | +| [override](#input\_override) | Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment. | `bool` | `false` | no | +| [override\_json\_string](#input\_override\_json\_string) | Override default values with a JSON object. 
Any JSON other than an empty string overrides other configuration changes. | `string` | `null` | no | +| [placement\_strategy](#input\_placement\_strategy) | VPC placement groups to create (null / host\_spread / power\_spread) | `string` | `null` | no | +| [cluster_prefix](#input\_prefix) | A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters. | `string` | `"lsf"` | no | +| [protocol\_instances](#input\_protocol\_instances) | Number of instances to be launched for protocol hosts. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 2,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "bx2-2x8"
}
]
| no | +| [protocol\_subnets\_cidr](#input\_protocol\_subnets\_cidr) | Subnet CIDR block to launch the protocol hosts. | `string` | `"10.10.40.0/24"` | no | +| [scc\_enable](#input\_scc\_enable) | Flag to enable SCC instance creation. If true, an instance of SCC (Security and Compliance Center) will be created. | `bool` | `true` | no | +| [scc\_event\_notification\_plan](#input\_scc\_event\_notification\_plan) | Event Notifications Instance plan to be used (it's used with S.C.C. instance), possible values 'lite' and 'standard'. | `string` | `"lite"` | no | +| [scc\_location](#input\_scc\_location) | Location where the SCC instance is provisioned (possible choices 'us-south', 'eu-de', 'ca-tor', 'eu-es') | `string` | `"us-south"` | no | +| [scc\_profile](#input\_scc\_profile) | Profile to be set on the SCC Instance (accepting empty, 'CIS IBM Cloud Foundations Benchmark' and 'IBM Cloud Framework for Financial Services') | `string` | `"CIS IBM Cloud Foundations Benchmark v1.1.0"` | no | +| [skip\_flowlogs\_s2s\_auth\_policy](#input\_skip\_flowlogs\_s2s\_auth\_policy) | Skip auth policy between flow logs service and COS instance, set to true if this policy is already in place on account. | `bool` | `false` | no | +| [skip\_iam\_authorization\_policy](#input\_skip\_iam\_authorization\_policy) | Set to false if authorization policy is required for VPC block storage volumes to access kms. This can be set to true if authorization policy already exists. For more information on how to create authorization policy manually, see [creating authorization policies for block storage volume](https://cloud.ibm.com/docs/vpc?topic=vpc-block-s2s-auth&interface=ui). | `bool` | `false` | no | +| [skip\_kms\_s2s\_auth\_policy](#input\_skip\_kms\_s2s\_auth\_policy) | Skip auth policy between KMS service and COS instance, set to true if this policy is already in place on account. | `bool` | `false` | no | +| [ssh\_keys](#input\_ssh\_keys) | The key pair to use to access the HPC cluster. | `list(string)` | `null` | no | +| [static\_compute\_instances](#input\_static\_compute\_instances) | Minimum number of instances to be launched for the compute cluster. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 1,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [storage\_gui\_password](#input\_storage\_gui\_password) | Password for storage cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [storage\_gui\_username](#input\_storage\_gui\_username) | GUI user to perform system management and monitoring tasks on storage cluster. | `string` | `"admin"` | no | +| [storage\_instances](#input\_storage\_instances) | Number of instances to be launched for storage cluster. |
list(
object({
profile = string
count = number
image = string
filesystem_name = optional(string)
})
)
|
[
{
"count": 2,
"filesystem_name": "fs1",
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "bx2-2x8"
}
]
| no | +| [storage\_ssh\_keys](#input\_storage\_ssh\_keys) | The key pair to use to launch the storage cluster host. | `list(string)` | `null` | no | +| [storage\_subnets\_cidr](#input\_storage\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `string` | `"10.10.30.0/24"` | no | +| [vpc\_name](#input\_vpc\_name) | Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc) | `string` | `null` | no | +| [vpn\_peer\_address](#input\_vpn\_peer\_address) | The peer public IP address to which the VPN will be connected. | `string` | `null` | no | +| [vpn\_peer\_cidr](#input\_vpn\_peer\_cidr) | The peer CIDRs (e.g., 192.168.0.0/24) to which the VPN will be connected. | `list(string)` | `null` | no | +| [vpn\_preshared\_key](#input\_vpn\_preshared\_key) | The pre-shared key for the VPN. | `string` | `null` | no | +| [zones](#input\_zones) | Region where VPC will be created. To find your VPC region, use `ibmcloud is regions` command to find available regions. | `list(string)` |
[
"us-south-1",
"us-south-2",
"us-south-3"
]
| no | + +## Outputs + +| Name | Description | +|------|-------------| +| [lsf](#output\_lsf) | LSF details | diff --git a/solutions/lsf/catalogValidationValues.json.template b/solutions/lsf/catalogValidationValues.json.template new file mode 100644 index 00000000..bb5298d4 --- /dev/null +++ b/solutions/lsf/catalogValidationValues.json.template @@ -0,0 +1,7 @@ +{ + "ibmcloud_api_key": $VALIDATION_APIKEY, + "cluster_prefix": $PREFIX, + "zones": "[\"ca-tor-1\"]", + "existing_resource_group": "geretain-hpc-rg", + "ssh_keys": "[\"geretain-hpc-ssh-key\"]" +} diff --git a/solutions/lsf/datasource.tf b/solutions/lsf/datasource.tf new file mode 100644 index 00000000..afdf5435 --- /dev/null +++ b/solutions/lsf/datasource.tf @@ -0,0 +1,18 @@ +data "ibm_is_subnet" "existing_login_subnets" { + count = var.vpc_name != null && var.login_subnet_id != null ? 1 : 0 + identifier = var.login_subnet_id +} + +data "ibm_is_vpc" "existing_vpc" { + count = var.vpc_name != null ? 1 : 0 + name = var.vpc_name +} + +data "ibm_is_subnet" "existing_cluster_subnets" { + count = var.vpc_name != null && var.cluster_subnet_id != null ? 1 : 0 + identifier = var.cluster_subnet_id +} + +data "ibm_is_public_gateways" "public_gateways" { + count = var.vpc_name != null && var.cluster_subnet_id == null && var.login_subnet_id == null ? 1 : 0 +} diff --git a/solutions/lsf/input_validation.tf b/solutions/lsf/input_validation.tf new file mode 100644 index 00000000..64e6f141 --- /dev/null +++ b/solutions/lsf/input_validation.tf @@ -0,0 +1,57 @@ +################################################### +# Copyright (C) IBM Corp. 2023 All Rights Reserved. +# Licensed under the Apache License v2.0 +################################################### +# This file contains the complete information on all the validations performed from the code during the generate plan process +# Validations are performed to make sure, the appropriate error messages are displayed to user in-order to provide required input parameter + +locals { + + # Validate existing login subnet should be in the appropriate zone. + validate_login_subnet_id_zone_msg = "Provided login subnet should be in appropriate zone." + validate_login_subnet_id_zone = anytrue([var.login_subnet_id == null, var.login_subnet_id != null && var.vpc_name != null ? alltrue([data.ibm_is_subnet.existing_login_subnets[0].zone == var.zones[0]]) : false]) + # tflint-ignore: terraform_unused_declarations + validate_login_subnet_id_zone_chk = regex("^${local.validate_login_subnet_id_zone_msg}$", + (local.validate_login_subnet_id_zone ? local.validate_login_subnet_id_zone_msg : "")) + + # Validate existing login subnet should be the subset of vpc_name entered + validate_login_subnet_id_vpc_msg = "Provided login subnet should be within the vpc entered." + validate_login_subnet_id_vpc = anytrue([var.login_subnet_id == null, var.login_subnet_id != null && var.vpc_name != null ? alltrue([for subnet_id in [var.login_subnet_id] : contains(data.ibm_is_vpc.existing_vpc[0].subnets[*].id, subnet_id)]) : false]) + # tflint-ignore: terraform_unused_declarations + validate_login_subnet_id_vpc_chk = regex("^${local.validate_login_subnet_id_vpc_msg}$", + (local.validate_login_subnet_id_vpc ? local.validate_login_subnet_id_vpc_msg : "")) + + # Validate existing subnet public gateways + validate_subnet_name_pg_msg = "Provided existing cluster_subnet_id should have public gateway attached." + validate_subnet_name_pg = anytrue([var.cluster_subnet_id == null, var.cluster_subnet_id != null && var.vpc_name != null ? 
(data.ibm_is_subnet.existing_cluster_subnets[0].public_gateway != "") : false]) + # tflint-ignore: terraform_unused_declarations + validate_subnet_name_pg_chk = regex("^${local.validate_subnet_name_pg_msg}$", + (local.validate_subnet_name_pg ? local.validate_subnet_name_pg_msg : "")) + + # Validate existing cluster subnet should be in the appropriate zone. + validate_subnet_id_zone_msg = "Provided cluster subnets should be in appropriate zone." + validate_subnet_id_zone = anytrue([var.cluster_subnet_id == null, var.cluster_subnet_id != null && var.vpc_name != null ? alltrue([data.ibm_is_subnet.existing_cluster_subnets[0].zone == var.zones[0]]) : false]) + # tflint-ignore: terraform_unused_declarations + validate_subnet_id_zone_chk = regex("^${local.validate_subnet_id_zone_msg}$", + (local.validate_subnet_id_zone ? local.validate_subnet_id_zone_msg : "")) + + # Validate existing cluster subnet should be the subset of vpc_name entered + validate_cluster_subnet_id_vpc_msg = "Provided cluster subnet should be within the vpc entered." + validate_cluster_subnet_id_vpc = anytrue([var.cluster_subnet_id == null, var.cluster_subnet_id != null && var.vpc_name != null ? alltrue([for subnet_id in [var.cluster_subnet_id] : contains(data.ibm_is_vpc.existing_vpc[0].subnets[*].id, subnet_id)]) : false]) + # tflint-ignore: terraform_unused_declarations + validate_subnet_id_vpc_chk = regex("^${local.validate_cluster_subnet_id_vpc_msg}$", + (local.validate_cluster_subnet_id_vpc ? local.validate_cluster_subnet_id_vpc_msg : "")) + + # Validate existing vpc public gateways + validate_existing_vpc_pgw_msg = "Provided existing vpc should have the public gateways created in the provided zones." + validate_existing_vpc_pgw = anytrue([(var.vpc_name == null), alltrue([var.vpc_name != null, var.cluster_subnet_id != null]), alltrue([var.vpc_name != null, var.cluster_subnet_id == null, var.login_subnet_id == null, length(local.zone_1_pgw_ids) > 0])]) + # tflint-ignore: terraform_unused_declarations + validate_existing_vpc_pgw_chk = regex("^${local.validate_existing_vpc_pgw_msg}$", + (local.validate_existing_vpc_pgw ? local.validate_existing_vpc_pgw_msg : "")) +} + +locals { + vpc_id = var.vpc_name != null && var.cluster_subnet_id == null && var.login_subnet_id == null ? data.ibm_is_vpc.existing_vpc[0].id : null + public_gateways_list = var.vpc_name != null && var.cluster_subnet_id == null && var.login_subnet_id == null ? data.ibm_is_public_gateways.public_gateways[0].public_gateways : [] + zone_1_pgw_ids = var.vpc_name != null && var.cluster_subnet_id == null && var.login_subnet_id == null ? [for gateway in local.public_gateways_list : gateway.id if gateway.vpc == local.vpc_id && gateway.zone == var.zones[0]] : [] +} diff --git a/solutions/lsf/locals.tf b/solutions/lsf/locals.tf new file mode 100644 index 00000000..ce4d1c41 --- /dev/null +++ b/solutions/lsf/locals.tf @@ -0,0 +1,173 @@ +# locals needed for ibm provider +locals { + # Region and Zone calculations + region = join("-", slice(split("-", var.zones[0]), 0, 2)) +} + +locals { + override_json_path = abspath("./override.json") + override = { + override = jsondecode(var.override && var.override_json_string == null ? + (local.override_json_path == "" ? file("${path.root}/override.json") : file(local.override_json_path)) + : + "{}") + override_json_string = jsondecode(var.override_json_string == null ? "{}" : var.override_json_string) + } + override_type = var.override_json_string == null ? 
"override" : "override_json_string" +} + +locals { + config = { + existing_resource_group = var.existing_resource_group + remote_allowed_ips = var.remote_allowed_ips + ssh_keys = var.ssh_keys + vpc_cluster_login_private_subnets_cidr_blocks = var.vpc_cluster_login_private_subnets_cidr_blocks + vpc_cluster_private_subnets_cidr_blocks = var.vpc_cluster_private_subnets_cidr_blocks + cluster_subnet_id = var.cluster_subnet_id + cos_instance_name = var.cos_instance_name + dns_custom_resolver_id = var.dns_custom_resolver_id + dns_instance_id = var.dns_instance_id + dns_domain_name = var.dns_domain_name + dynamic_compute_instances = var.dynamic_compute_instances + bastion_instance = var.bastion_instance + login_subnet_id = var.login_subnet_id + deployer_instance = var.deployer_instance + enable_cos_integration = var.enable_cos_integration + enable_vpc_flow_logs = var.enable_vpc_flow_logs + custom_file_shares = var.custom_file_shares + storage_security_group_id = var.storage_security_group_id + key_management = var.key_management + management_instances = var.management_instances + vpc_cidr = var.vpc_cidr + # placement_strategy = var.placement_strategy + cluster_prefix = var.cluster_prefix + static_compute_instances = var.static_compute_instances + vpc_name = var.vpc_name + kms_instance_name = var.kms_instance_name + kms_key_name = var.kms_key_name + skip_iam_share_authorization_policy = var.skip_iam_share_authorization_policy + observability_atracker_enable = var.observability_atracker_enable + observability_atracker_target_type = var.observability_atracker_target_type + observability_monitoring_enable = var.observability_monitoring_enable + observability_logs_enable_for_management = var.observability_logs_enable_for_management + observability_logs_enable_for_compute = var.observability_logs_enable_for_compute + observability_enable_platform_logs = var.observability_enable_platform_logs + observability_enable_metrics_routing = var.observability_enable_metrics_routing + observability_logs_retention_period = var.observability_logs_retention_period + observability_monitoring_on_compute_nodes_enable = var.observability_monitoring_on_compute_nodes_enable + observability_monitoring_plan = var.observability_monitoring_plan + skip_flowlogs_s2s_auth_policy = var.skip_flowlogs_s2s_auth_policy + skip_iam_block_storage_authorization_policy = var.skip_iam_block_storage_authorization_policy + skip_kms_s2s_auth_policy = var.skip_kms_s2s_auth_policy + ibmcloud_api_key = var.ibmcloud_api_key + app_center_gui_password = var.app_center_gui_password + lsf_version = var.lsf_version + enable_hyperthreading = var.enable_hyperthreading + enable_ldap = var.enable_ldap + ldap_basedns = var.ldap_basedns + ldap_admin_password = var.ldap_admin_password + ldap_user_name = var.ldap_user_name + ldap_user_password = var.ldap_user_password + ldap_server = var.ldap_server + ldap_server_cert = var.ldap_server_cert + ldap_instance = var.ldap_instance + enable_dedicated_host = var.enable_dedicated_host + existing_bastion_instance_name = var.existing_bastion_instance_name + existing_bastion_instance_public_ip = var.existing_bastion_instance_public_ip + existing_bastion_security_group_id = var.existing_bastion_security_group_id + existing_bastion_ssh_private_key = var.existing_bastion_ssh_private_key + login_instance = var.login_instance + vpn_enabled = var.vpn_enabled + sccwp_service_plan = var.sccwp_service_plan + sccwp_enable = var.sccwp_enable + cspm_enabled = var.cspm_enabled + app_config_plan = var.app_config_plan + + } +} + +# 
Compile Environment for Config output +locals { + env = { + existing_resource_group = lookup(local.override[local.override_type], "existing_resource_group", local.config.existing_resource_group) + remote_allowed_ips = lookup(local.override[local.override_type], "remote_allowed_ips", local.config.remote_allowed_ips) + ssh_keys = lookup(local.override[local.override_type], "ssh_keys", local.config.ssh_keys) + vpc_cluster_login_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_login_private_subnets_cidr_blocks", local.config.vpc_cluster_login_private_subnets_cidr_blocks) + login_subnet_id = lookup(local.override[local.override_type], "login_subnet_id", local.config.login_subnet_id) + vpc_cluster_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_private_subnets_cidr_blocks", local.config.vpc_cluster_private_subnets_cidr_blocks) + cluster_subnet_id = lookup(local.override[local.override_type], "cluster_subnet_id", local.config.cluster_subnet_id) + cos_instance_name = lookup(local.override[local.override_type], "cos_instance_name", local.config.cos_instance_name) + dns_custom_resolver_id = lookup(local.override[local.override_type], "dns_custom_resolver_id", local.config.dns_custom_resolver_id) + dns_instance_id = lookup(local.override[local.override_type], "dns_instance_id", local.config.dns_instance_id) + dns_domain_name = lookup(local.override[local.override_type], "dns_domain_name", local.config.dns_domain_name) + dynamic_compute_instances = lookup(local.override[local.override_type], "dynamic_compute_instances", local.config.dynamic_compute_instances) + bastion_instance = lookup(local.override[local.override_type], "bastion_instance", local.config.bastion_instance) + deployer_instance = lookup(local.override[local.override_type], "deployer_instance", local.config.deployer_instance) + enable_cos_integration = lookup(local.override[local.override_type], "enable_cos_integration", local.config.enable_cos_integration) + enable_vpc_flow_logs = lookup(local.override[local.override_type], "enable_vpc_flow_logs", local.config.enable_vpc_flow_logs) + custom_file_shares = lookup(local.override[local.override_type], "custom_file_shares", local.config.custom_file_shares) + storage_security_group_id = lookup(local.override[local.override_type], "storage_security_group_id", local.config.storage_security_group_id) + key_management = lookup(local.override[local.override_type], "key_management", local.config.key_management) + management_instances = lookup(local.override[local.override_type], "management_instances", local.config.management_instances) + vpc_cidr = lookup(local.override[local.override_type], "vpc_cidr", local.config.vpc_cidr) + # placement_strategy = lookup(local.override[local.override_type], "placement_strategy", local.config.placement_strategy) + cluster_prefix = lookup(local.override[local.override_type], "cluster_prefix", local.config.cluster_prefix) + static_compute_instances = lookup(local.override[local.override_type], "static_compute_instances", local.config.static_compute_instances) + vpc_name = lookup(local.override[local.override_type], "vpc_name", local.config.vpc_name) + kms_instance_name = lookup(local.override[local.override_type], "kms_instance_name", local.config.kms_instance_name) + kms_key_name = lookup(local.override[local.override_type], "kms_key_name", local.config.kms_key_name) + skip_iam_share_authorization_policy = lookup(local.override[local.override_type], "skip_iam_share_authorization_policy", 
local.config.skip_iam_share_authorization_policy) + observability_atracker_enable = lookup(local.override[local.override_type], "observability_atracker_enable", local.config.observability_atracker_enable) + observability_atracker_target_type = lookup(local.override[local.override_type], "observability_atracker_target_type", local.config.observability_atracker_target_type) + observability_monitoring_enable = lookup(local.override[local.override_type], "observability_monitoring_enable", local.config.observability_monitoring_enable) + observability_logs_enable_for_management = lookup(local.override[local.override_type], "observability_logs_enable_for_management", local.config.observability_logs_enable_for_management) + observability_logs_enable_for_compute = lookup(local.override[local.override_type], "observability_logs_enable_for_compute", local.config.observability_logs_enable_for_compute) + observability_enable_platform_logs = lookup(local.override[local.override_type], "observability_enable_platform_logs", local.config.observability_enable_platform_logs) + observability_enable_metrics_routing = lookup(local.override[local.override_type], "observability_enable_metrics_routing", local.config.observability_enable_metrics_routing) + observability_logs_retention_period = lookup(local.override[local.override_type], "observability_logs_retention_period", local.config.observability_logs_retention_period) + observability_monitoring_on_compute_nodes_enable = lookup(local.override[local.override_type], "observability_monitoring_on_compute_nodes_enable", local.config.observability_monitoring_on_compute_nodes_enable) + observability_monitoring_plan = lookup(local.override[local.override_type], "observability_monitoring_plan", local.config.observability_monitoring_plan) + skip_flowlogs_s2s_auth_policy = lookup(local.override[local.override_type], "skip_flowlogs_s2s_auth_policy", local.config.skip_flowlogs_s2s_auth_policy) + skip_iam_block_storage_authorization_policy = lookup(local.override[local.override_type], "skip_iam_block_storage_authorization_policy", local.config.skip_iam_block_storage_authorization_policy) + skip_kms_s2s_auth_policy = lookup(local.override[local.override_type], "skip_kms_s2s_auth_policy", local.config.skip_kms_s2s_auth_policy) + ibmcloud_api_key = lookup(local.override[local.override_type], "ibmcloud_api_key", local.config.ibmcloud_api_key) + app_center_gui_password = lookup(local.override[local.override_type], "app_center_gui_password", local.config.app_center_gui_password) + lsf_version = lookup(local.override[local.override_type], "lsf_version", local.config.lsf_version) + enable_hyperthreading = lookup(local.override[local.override_type], "enable_hyperthreading", local.config.enable_hyperthreading) + enable_ldap = lookup(local.override[local.override_type], "enable_ldap", local.config.enable_ldap) + vpn_enabled = lookup(local.override[local.override_type], "vpn_enabled", local.config.vpn_enabled) + ldap_basedns = lookup(local.override[local.override_type], "ldap_basedns", local.config.ldap_basedns) + ldap_admin_password = lookup(local.override[local.override_type], "ldap_admin_password", local.config.ldap_admin_password) + ldap_user_name = lookup(local.override[local.override_type], "ldap_user_name", local.config.ldap_user_name) + ldap_user_password = lookup(local.override[local.override_type], "ldap_user_password", local.config.ldap_user_password) + ldap_server = lookup(local.override[local.override_type], "ldap_server", local.config.ldap_server) + ldap_server_cert = 
lookup(local.override[local.override_type], "ldap_server_cert", local.config.ldap_server_cert) + ldap_instance = lookup(local.override[local.override_type], "ldap_instance", local.config.ldap_instance) + enable_dedicated_host = lookup(local.override[local.override_type], "enable_dedicated_host", local.config.enable_dedicated_host) + existing_bastion_instance_name = lookup(local.override[local.override_type], "existing_bastion_instance_name", local.config.existing_bastion_instance_name) + existing_bastion_instance_public_ip = lookup(local.override[local.override_type], "existing_bastion_instance_public_ip", local.config.existing_bastion_instance_public_ip) + existing_bastion_security_group_id = lookup(local.override[local.override_type], "existing_bastion_security_group_id", local.config.existing_bastion_security_group_id) + existing_bastion_ssh_private_key = lookup(local.override[local.override_type], "existing_bastion_ssh_private_key", local.config.existing_bastion_ssh_private_key) + login_instance = lookup(local.override[local.override_type], "login_instance", local.config.login_instance) + sccwp_enable = lookup(local.override[local.override_type], "scc_wp_enable", local.config.sccwp_enable) + cspm_enable = lookup(local.override[local.override_type], "cspm_enable", local.config.cspm_enabled) + sccwp_service_plan = lookup(local.override[local.override_type], "scc_wp_service_plan", local.config.sccwp_service_plan) + app_config_plan = lookup(local.override[local.override_type], "app_config_plan", local.config.app_config_plan) + # client_instances = lookup(local.override[local.override_type], "client_instances", local.config.client_instances) + # client_subnets_cidr = lookup(local.override[local.override_type], "client_subnets_cidr", local.config.client_subnets_cidr) + } +} +locals { + custom_fileshare_iops_range = [ + [10, 39, 100, 1000], + [40, 79, 100, 2000], + [80, 99, 100, 4000], + [100, 499, 100, 6000], + [500, 999, 100, 10000], + [1000, 1999, 100, 20000], + [2000, 3999, 200, 40000], + [4000, 7999, 300, 40000], + [8000, 15999, 500, 64000], + [16000, 32000, 2000, 96000] + ] +} diff --git a/solutions/lsf/main.tf b/solutions/lsf/main.tf new file mode 100644 index 00000000..774e2dec --- /dev/null +++ b/solutions/lsf/main.tf @@ -0,0 +1,68 @@ +module "lsf" { + source = "./../.." 
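+  # Wiring note (descriptive, no behavior change): every input below is resolved through local.env in
+  # locals.tf earlier in this change, i.e. lookup(local.override[local.override_type], "<key>", local.config.<key>),
+  # so entries in override.json (when var.override is true) or in override_json_string take precedence
+  # over the corresponding root variables.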
+ scheduler = "LSF" + zones = var.zones + remote_allowed_ips = var.remote_allowed_ips + cluster_prefix = local.env.cluster_prefix + ssh_keys = local.env.ssh_keys + existing_resource_group = local.env.existing_resource_group + vpc_cluster_login_private_subnets_cidr_blocks = local.env.vpc_cluster_login_private_subnets_cidr_blocks + login_subnet_id = local.env.login_subnet_id + vpc_cluster_private_subnets_cidr_blocks = local.env.vpc_cluster_private_subnets_cidr_blocks + cluster_subnet_id = local.env.cluster_subnet_id + cos_instance_name = local.env.cos_instance_name + dns_custom_resolver_id = local.env.dns_custom_resolver_id + dns_instance_id = local.env.dns_instance_id + dns_domain_names = local.env.dns_domain_name + dynamic_compute_instances = local.env.dynamic_compute_instances + bastion_instance = local.env.bastion_instance + deployer_instance = local.env.deployer_instance + enable_cos_integration = local.env.enable_cos_integration + enable_vpc_flow_logs = local.env.enable_vpc_flow_logs + custom_file_shares = local.env.custom_file_shares + storage_security_group_id = local.env.storage_security_group_id + key_management = local.env.key_management + management_instances = local.env.management_instances + vpc_cidr = local.env.vpc_cidr + static_compute_instances = local.env.static_compute_instances + vpc_name = local.env.vpc_name + kms_instance_name = local.env.kms_instance_name + kms_key_name = local.env.kms_key_name + skip_iam_share_authorization_policy = local.env.skip_iam_share_authorization_policy + observability_atracker_enable = local.env.observability_atracker_enable + observability_atracker_target_type = local.env.observability_atracker_target_type + observability_monitoring_enable = local.env.observability_monitoring_enable + observability_monitoring_on_compute_nodes_enable = local.env.observability_monitoring_on_compute_nodes_enable + observability_logs_enable_for_management = local.env.observability_logs_enable_for_management + observability_logs_enable_for_compute = local.env.observability_logs_enable_for_compute + observability_enable_platform_logs = local.env.observability_enable_platform_logs + observability_enable_metrics_routing = local.env.observability_enable_metrics_routing + observability_logs_retention_period = local.env.observability_logs_retention_period + observability_monitoring_plan = local.env.observability_monitoring_plan + skip_flowlogs_s2s_auth_policy = local.env.skip_flowlogs_s2s_auth_policy + skip_iam_block_storage_authorization_policy = local.env.skip_iam_block_storage_authorization_policy + skip_kms_s2s_auth_policy = local.env.skip_kms_s2s_auth_policy + ibmcloud_api_key = local.env.ibmcloud_api_key + app_center_gui_password = local.env.app_center_gui_password + lsf_version = local.env.lsf_version + enable_hyperthreading = local.env.enable_hyperthreading + enable_ldap = local.env.enable_ldap + ldap_basedns = local.env.ldap_basedns + ldap_admin_password = local.env.ldap_admin_password + ldap_user_name = local.env.ldap_user_name + ldap_user_password = local.env.ldap_user_password + ldap_server = local.env.ldap_server + ldap_server_cert = local.env.ldap_server_cert + ldap_instance = local.env.ldap_instance + enable_dedicated_host = local.env.enable_dedicated_host + existing_bastion_instance_name = local.env.existing_bastion_instance_name + existing_bastion_instance_public_ip = local.env.existing_bastion_instance_public_ip + existing_bastion_security_group_id = local.env.existing_bastion_security_group_id + existing_bastion_ssh_private_key = 
local.env.existing_bastion_ssh_private_key + vpn_enabled = local.env.vpn_enabled + login_instance = local.env.login_instance + sccwp_enable = local.env.sccwp_enable + sccwp_service_plan = local.env.sccwp_service_plan + cspm_enabled = var.cspm_enabled + app_config_plan = var.app_config_plan +} diff --git a/solutions/lsf/outputs.tf b/solutions/lsf/outputs.tf new file mode 100644 index 00000000..e69de29b diff --git a/solutions/lsf/override.json b/solutions/lsf/override.json new file mode 100644 index 00000000..ddfc48ca --- /dev/null +++ b/solutions/lsf/override.json @@ -0,0 +1,108 @@ +{ + "cluster_prefix": "lsf", + "lsf_version": "fixpack_15", + "existing_resource_group": "Default", + "vpc_name": null, + "vpc_cidr": "10.0.0.0/8", + "placement_strategy": null, + "ssh_keys": null, + "enable_bastion": true, + "enable_deployer": false, + "bastion_instance": { + "profile": "cx2-4x8", + "image": "ibm-ubuntu-22-04-5-minimal-amd64-3" + }, + "deployer_instance": { + "profile": "bx2-8x32", + "image": "hpc-lsf-fp15-deployer-rhel810-v1" + }, + "vpc_cluster_login_private_subnets_cidr_blocks": "10.0.0.0/24", + "vpc_cluster_private_subnets_cidr_blocks": "10.10.20.0/24", + "management_instances": [ + { + "profile": "cx2-2x4", + "count": 3, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "static_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 0, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "dynamic_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 5000, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "compute_gui_username": "admin", + "enable_hyperthreading": true, + "storage_subnets_cidr": "10.10.30.0/24", + "storage_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "storage_servers": [ + { + "profile": "cx2d-metal-96x192", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "protocol_subnets_cidr": "10.10.40.0/24", + "protocol_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "storage_gui_username": "admin", + "custom_file_shares": [ + { + "mount_path": "/mnt/binaries", + "size": 100, + "iops": 1000 + }, + { + "mount_path": "/mnt/data", + "size": 100, + "iops": 1000 + } + ], + "dns_instance_id": null, + "dns_custom_resolver_id": null, + "dns_domain_name": { + "compute": "comp.com" + }, + "enable_cos_integration": true, + "cos_instance_name": null, + "enable_vpc_flow_logs": true, + "key_management": "key_protect", + "kms_instance_name": null, + "kms_key_name": null, + "observability_atracker_enable": true, + "observability_atracker_target_type": "cloudlogs", + "observability_monitoring_enable": true, + "observability_logs_enable_for_management": false, + "observability_logs_enable_for_compute": false, + "observability_enable_platform_logs": false, + "observability_enable_metrics_routing": false, + "observability_logs_retention_period": 7, + "observability_monitoring_on_compute_nodes_enable": false, + "observability_monitoring_plan": "graduated-tier", + "scc_enable": true, + "scc_profile": "CIS IBM Cloud Foundations Benchmark v1.1.0", + "scc_location": "us-south", + "scc_event_notification_plan": "lite", + "skip_flowlogs_s2s_auth_policy": false, + "skip_kms_s2s_auth_policy": false, + "skip_iam_authorization_policy": false +} diff --git a/solutions/lsf/variables.tf b/solutions/lsf/variables.tf new file mode 100644 index 00000000..0cde4d3e --- /dev/null +++ b/solutions/lsf/variables.tf @@ -0,0 +1,840 @@ 
+############################################################################## +# Mandatory Required variables +############################################################################## +variable "ibmcloud_api_key" { + description = "Provide the IBM Cloud API key associated with the account to deploy the IBM Spectrum LSF cluster. This key is used to authenticate your deployment and grant the necessary access to create and manage resources in your IBM Cloud environment, see [Managing user API keys](https://cloud.ibm.com/docs/account?topic=account-userapikey)." + type = string + sensitive = true + validation { + condition = var.ibmcloud_api_key != "" + error_message = "The API key for IBM Cloud must be set." + } +} + +variable "lsf_version" { + type = string + default = "fixpack_15" + description = "Select the desired version of IBM Spectrum LSF to deploy either fixpack_15 or fixpack_14. By default, the solution uses the latest available version, which is Fix Pack 15. If you need to deploy an earlier version such as Fix Pack 14, update the lsf_version field to fixpack_14. When changing the LSF version, ensure that all custom images used for management, compute, and login nodes correspond to the same version. This is essential to maintain compatibility across the cluster and to prevent deployment issues." + + validation { + condition = contains(["fixpack_14", "fixpack_15"], var.lsf_version) + error_message = "Invalid LSF version. Allowed values are 'fixpack_14' and 'fixpack_15'" + } +} + +variable "app_center_gui_password" { + type = string + default = "" + sensitive = true + description = "Password required to access the IBM Spectrum LSF Application Center (App Center) GUI, which is enabled by default in both Fix Pack 15 and Fix Pack 14 with HTTPS. This is a mandatory value and omitting it will result in deployment failure. The password must meet the following requirements, at least 8 characters in length, and must include one uppercase letter, one lowercase letter, one number, and one special character. Spaces are not allowed." + + validation { + condition = ( + can(regex("^.{8,}$", var.app_center_gui_password)) && + can(regex("[0-9]", var.app_center_gui_password)) && + can(regex("[a-z]", var.app_center_gui_password)) && + can(regex("[A-Z]", var.app_center_gui_password)) && + can(regex("[!@#$%^&*()_+=-]", var.app_center_gui_password)) && + !can(regex(".*\\s.*", var.app_center_gui_password)) + ) + error_message = "The password must be at least 8 characters long and include at least one lowercase letter, one uppercase letter, one number, and one special character (!@#$%^&*()_+=-). Spaces are not allowed." + } +} + + +############################################################################## +# Cluster Level Variables +############################################################################## +variable "zones" { + description = "Specify the IBM Cloud zone within the chosen region where the IBM Spectrum LSF cluster will be deployed. A single zone input is required, and the management nodes, file storage shares, and compute nodes will all be provisioned in this zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli)." + type = list(string) + default = ["us-east-1"] + validation { + condition = length(var.zones) == 1 + error_message = "HPC product deployment supports only a single zone. 
Provide a value for a single zone from the supported regions: eu-de-2 or eu-de-3 for eu-de, us-east-1 or us-east-3 for us-east, and us-south-1 for us-south." + + } +} + +variable "ssh_keys" { + type = list(string) + default = null + description = "Provide the list of SSH key names already configured in your IBM Cloud account to establish a connection to the Spectrum LSF nodes. Solution does not create new SSH keys, provide the existing keys. Make sure the SSH key exists in the same resource group and region where the cluster is being provisioned. To pass multiple SSH keys, use the format [\"key-name-1\", \"key-name-2\"]. If you don't have an SSH key in your IBM Cloud account, you can create one by following the provided .[SSH Keys](https://cloud.ibm.com/docs/vpc?topic=vpc-ssh-keys)." +} + +variable "remote_allowed_ips" { + type = list(string) + description = "Comma-separated list of IP addresses that can access the IBM Spectrum LSF cluster instance through an SSH interface. For security purposes, provide the public IP addresses assigned to the devices that are authorized to establish SSH connections (for example, [\"169.45.117.34\"]). To fetch the IP address of the device, use [https://ipv4.icanhazip.com/](https://ipv4.icanhazip.com/)." + validation { + condition = alltrue([ + for o in var.remote_allowed_ips : !contains(["0.0.0.0/0", "0.0.0.0"], o) + ]) + error_message = "For security, provide the public IP addresses assigned to the devices authorized to establish SSH connections. Use https://ipv4.icanhazip.com/ to fetch the ip address of the device." + } + validation { + condition = alltrue([ + for a in var.remote_allowed_ips : can(regex("^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(/(3[0-2]|2[0-9]|1[0-9]|[0-9]))?$", a)) + ]) + error_message = "The provided IP address format is not valid. Check if the IP address contains a comma instead of a dot, and ensure there are double quotation marks between each IP address range if using multiple IP ranges. For multiple IP address, use the format [\"169.45.117.34\",\"128.122.144.145\"]." + } +} + +############################################################################## +# Prefix Variables +############################################################################## +variable "cluster_prefix" { + description = "This prefix uniquely identifies the IBM Cloud Spectrum LSF cluster and its resources, it must always be unique. The name must start with a lowercase letter and can include only lowercase letters, digits, and hyphens. Hyphens must be followed by a lowercase letter or digit, with no leading, trailing, or consecutive hyphens. The prefix length must be less than 16 characters." + type = string + default = "hpc-lsf" + + validation { + error_message = "Prefix must start with a lowercase letter and contain only lowercase letters, digits, and hyphens in between. Hyphens must be followed by at least one lowercase letter or digit. There are no leading, trailing, or consecutive hyphens." + condition = can(regex("^[a-z](?:[a-z0-9]*(-[a-z0-9]+)*)?$", var.cluster_prefix)) + } + validation { + condition = length(var.cluster_prefix) <= 16 + error_message = "The cluster_prefix must be 16 characters or fewer." 
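+    # Illustrative values only, checked against the prefix regex and length rules above:
+    #   accepted: "hpc-lsf", "lsf01", "a1-b2"
+    #   rejected: "Lsf" (uppercase), "-lsf" (leading hyphen), "lsf-" (trailing hyphen),
+    #             "hpc--lsf" (consecutive hyphens), any value longer than 16 characters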
+ } +} + +############################################################################## +# Resource Groups Variables +############################################################################## +variable "existing_resource_group" { + description = "Specify the name of the existing resource group in your IBM Cloud account where VPC resources will be deployed. By default, the resource group is set to 'Default.' In some older accounts, it may be 'default,' so please verify the resource group name before proceeding. If the value is set to \"null\", the automation will create two separate resource groups: 'workload-rg' and 'service-rg.' For more details, see Managing resource groups." + type = string + default = "Default" + validation { + condition = var.existing_resource_group != null + error_message = "If you want to provide null for resource_group variable, it should be within double quotes." + } +} + +############################################################################## +# VPC Variables +############################################################################## +variable "vpc_name" { + type = string + default = null + description = "Provide the name of an existing VPC in which the cluster resources will be deployed. If no value is given, solution provisions a new VPC. [Learn more](https://cloud.ibm.com/docs/vpc)." +} + +variable "vpc_cidr" { + type = string + default = "10.241.0.0/18" + description = "An address prefix is created for the new VPC when the vpc_name variable is set to null. This prefix is required to provision subnets within a single zone, and the subnets will be created using the specified CIDR blocks. For more information, see [Setting IP ranges](https://cloud.ibm.com/docs/vpc?topic=vpc-vpc-addressing-plan-design)." +} + +variable "vpc_cluster_login_private_subnets_cidr_blocks" { + type = string + default = "10.241.16.0/28" + description = "Specify the CIDR block for the private subnet used by the login cluster. Only a single CIDR block is required. In hybrid environments, ensure the CIDR range does not overlap with any on-premises networks. Since this subnet is dedicated to login virtual server instances, a /28 CIDR range is recommended." + validation { + condition = tonumber(regex("^.*?/(\\d+)$", var.vpc_cluster_login_private_subnets_cidr_blocks)[0]) <= 28 + error_message = "This subnet is used to create only a login virtual server instance. Providing a larger CIDR size will waste the usage of available IPs. A CIDR range of /28 is sufficient for the creation of the login subnet." + } +} + +variable "vpc_cluster_private_subnets_cidr_blocks" { + type = string + default = "10.241.0.0/20" + description = "Provide the CIDR block required for the creation of the compute cluster's private subnet. One CIDR block is required. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Ensure the selected CIDR block size can accommodate the maximum number of management and dynamic compute nodes expected in your cluster. For more information on CIDR block size selection, refer to the documentation, see [Choosing IP ranges for your VPC](https://cloud.ibm.com/docs/vpc?topic=vpc-choosing-ip-ranges-for-your-vpc)." +} + +variable "login_subnet_id" { + type = string + default = null + description = "Provide the ID of an existing subnet to deploy cluster resources, this is used only for provisioning bastion, deployer, and login nodes. 
If not provided, a new subnet will be created. When providing an existing subnet ID, make sure that the subnet has an associated public gateway. [Learn more](https://cloud.ibm.com/docs/vpc)." + validation { + condition = (var.cluster_subnet_id == null && var.login_subnet_id == null) || (var.cluster_subnet_id != null && var.login_subnet_id != null) + error_message = "When using existing subnets, provide both login_subnet_id and cluster_subnet_id." + } +} + +variable "cluster_subnet_id" { + type = string + default = null + description = "Provide the ID of an existing subnet to deploy cluster resources; this is used only for provisioning VPC file storage shares, management, and compute nodes. If not provided, a new subnet will be created. Ensure that a public gateway is attached to enable VPC API communication. [Learn more](https://cloud.ibm.com/docs/vpc)." + validation { + condition = anytrue([var.vpc_name != null && var.cluster_subnet_id != null, var.cluster_subnet_id == null]) + error_message = "If cluster_subnet_id is provided, vpc_name must also be provided." + } +} +############################################################################## +# Bastion/Deployer Variables +############################################################################## + +variable "bastion_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "ibm-ubuntu-22-04-5-minimal-amd64-3" + profile = "cx2-4x8" + } + description = "Configuration for the bastion node, including the image and instance profile. Only Ubuntu 22.04 stock images are supported." + validation { + condition = can(regex("^ibm-ubuntu", var.bastion_instance.image)) + error_message = "Only IBM Ubuntu stock images are supported for the bastion node." + } + validation { + condition = can(regex("^[^\\s]+-[0-9]+x[0-9]+", var.bastion_instance.profile)) + error_message = "The profile must be a valid virtual server instance profile." + } +} + +variable "deployer_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "hpc-lsf-fp15-deployer-rhel810-v1" + profile = "bx2-8x32" + } + description = "Configuration for the deployer node, including the custom image and instance profile. By default, the deployer node is created using Fix Pack 15. If deploying with Fix Pack 14, set lsf_version to fixpack_14 and use the corresponding image hpc-lsf-fp14-deployer-rhel810-v1. The selected image must align with the specified lsf_version; any mismatch may lead to deployment failures." + validation { + condition = contains([ + "hpc-lsf-fp15-deployer-rhel810-v1", + "hpc-lsf-fp14-deployer-rhel810-v1" + ], var.deployer_instance.image) + error_message = "Invalid deployer image. The allowed value for fixpack_15 is 'hpc-lsf-fp15-deployer-rhel810-v1' and for fixpack_14 is 'hpc-lsf-fp14-deployer-rhel810-v1'." + } + validation { + condition = ( + (!can(regex("fp15", var.deployer_instance.image)) || var.lsf_version == "fixpack_15") && + (!can(regex("fp14", var.deployer_instance.image)) || var.lsf_version == "fixpack_14") + ) + error_message = "Mismatch between deployer_instance.image and lsf_version. Use an image with 'fp14' only when lsf_version is fixpack_14, and 'fp15' only with fixpack_15." + } + validation { + condition = can(regex("^[^\\s]+-[0-9]+x[0-9]+", var.deployer_instance.profile)) + error_message = "The profile must be a valid virtual server instance profile."
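+    # Version/image pairing at a glance, as enforced by the validations above (illustrative):
+    #   lsf_version = "fixpack_15"  ->  image = "hpc-lsf-fp15-deployer-rhel810-v1"
+    #   lsf_version = "fixpack_14"  ->  image = "hpc-lsf-fp14-deployer-rhel810-v1"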
+ } +} + +############################################################################## +# LSF Cluster Variables +############################################################################## + +variable "login_instance" { + type = list( + object({ + profile = string + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + image = "hpc-lsf-fp15-compute-rhel810-v1" + }] + description = "Specify the list of login node configurations, including instance profile, image name. By default, login node is created using Fix Pack 15. If deploying with Fix Pack 14, set lsf_version to fixpack_14 and use the corresponding image hpc-lsf-fp14-compute-rhel810-v1. The selected image must align with the specified lsf_version, any mismatch may lead to deployment failures." + validation { + condition = alltrue([ + for inst in var.login_instance : can(regex("^[^\\s]+-[0-9]+x[0-9]+", inst.profile)) + ]) + error_message = "The profile must be a valid virtual server instance profile." + } + validation { + condition = alltrue([ + for inst in var.login_instance : ( + (!can(regex("fp15", inst.image)) || var.lsf_version == "fixpack_15") && + (!can(regex("fp14", inst.image)) || var.lsf_version == "fixpack_14") + ) + ]) + error_message = "Mismatch between login_instance image and lsf_version. Use an image with 'fp14' only when lsf_version is fixpack_14, and 'fp15' only with fixpack_15." + } +} + +variable "management_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-16x64" + count = 2 + image = "hpc-lsf-fp15-rhel810-v1" + }] + description = "Specify the list of management node configurations, including instance profile, image name, and count. By default, all management nodes are created using Fix Pack 15. If deploying with Fix Pack 14, set lsf_version to fixpack_14 and use the corresponding image hpc-lsf-fp14-rhel810-v1. The selected image must align with the specified lsf_version, any mismatch may lead to deployment failures. The solution allows customization of instance profiles and counts, but mixing custom images and IBM stock images across instances is not supported. If using IBM stock images, only Red Hat-based images are allowed." + validation { + condition = alltrue([for inst in var.management_instances : !contains([for i in var.management_instances : can(regex("^ibm", i.image))], true) || can(regex("^ibm-redhat", inst.image))]) + error_message = "When defining management_instances, all instances must either use custom images or IBM stock images exclusively — mixing the two is not supported. If stock images are used, only Red Hat-based IBM images (e.g., ibm-redhat-*) are allowed." + } + validation { + condition = alltrue([ + for inst in var.management_instances : can(regex("^[^\\s]+-[0-9]+x[0-9]+", inst.profile)) + ]) + error_message = "The profile must be a valid virtual server instance profile." + } + validation { + condition = sum([for inst in var.management_instances : inst.count]) <= 10 + error_message = "The total number of management node instances (sum of counts) must not exceed 10." + } + validation { + condition = alltrue([ + for inst in var.management_instances : ( + (!can(regex("fp15", inst.image)) || var.lsf_version == "fixpack_15") && + (!can(regex("fp14", inst.image)) || var.lsf_version == "fixpack_14") + ) + ]) + error_message = "Mismatch between management_instances image and lsf_version. Use an image with 'fp14' only when lsf_version is fixpack_14, and 'fp15' only with fixpack_15." 
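+    # Illustrative combinations for management_instances, given the image checks above (image names are
+    # the defaults used elsewhere in this change, not recommendations):
+    #   accepted: every entry uses "hpc-lsf-fp15-rhel810-v1" (custom, with lsf_version = "fixpack_15"),
+    #             or every entry uses "ibm-redhat-8-10-minimal-amd64-2" (IBM stock)
+    #   rejected: mixing custom and IBM stock images, or any non Red Hat "ibm-*" stock image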
+ } +} + +variable "static_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-4x16" + count = 1 + image = "hpc-lsf-fp15-compute-rhel810-v1" + }] + description = "Specify the list of static compute node configurations, including instance profile, image name, and count. By default, all compute nodes are created using Fix Pack 15. If deploying with Fix Pack 14, set lsf_version to fixpack_14 and use the corresponding image hpc-lsf-fp14-compute-rhel810-v1. The selected image must align with the specified lsf_version, any mismatch may lead to deployment failures. The solution allows customization of instance profiles and counts, but mixing custom images and IBM stock images across instances is not supported. If using IBM stock images, only Red Hat-based images are allowed." + validation { + condition = alltrue([ + for inst in var.static_compute_instances : + # If any instance uses IBM stock image, all must use it, and it should be redhat. + (!contains([for i in var.static_compute_instances : can(regex("^ibm-", i.image))], true) || can(regex("^ibm-redhat", inst.image))) + ]) + error_message = "When defining static_compute_instances, all instances must either use custom images or IBM stock images exclusively—mixing the two is not supported. If stock images are used, only Red Hat-based IBM images (e.g., ibm-redhat-*) are allowed." + } + validation { + condition = alltrue([ + for inst in var.static_compute_instances : can(regex("^[^\\s]+-[0-9]+x[0-9]+", inst.profile)) + ]) + error_message = "The profile must be a valid virtual server instance profile." + } + validation { + condition = alltrue([ + for inst in var.static_compute_instances : ( + (!can(regex("fp15", inst.image)) || var.lsf_version == "fixpack_15") && + (!can(regex("fp14", inst.image)) || var.lsf_version == "fixpack_14") + ) + ]) + error_message = "Mismatch between static_compute_instances image and lsf_version. Use an image with 'fp14' only when lsf_version is fixpack_14, and 'fp15' only with fixpack_15." + } +} + +variable "dynamic_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-4x16" + count = 500 + image = "hpc-lsf-fp15-compute-rhel810-v1" + }] + description = "Specify the list of dynamic compute node configurations, including instance profile, image name, and count. By default, all dynamic compute nodes are created using Fix Pack 15. If deploying with Fix Pack 14, set lsf_version to fixpack_14 and use the corresponding image hpc-lsf-fp14-compute-rhel810-v1. The selected image must align with the specified lsf_version, any mismatch may lead to deployment failures. Currently, only a single instance profile is supported for dynamic compute nodes—multiple profiles are not yet supported.." + validation { + condition = alltrue([ + for inst in var.dynamic_compute_instances : can(regex("^[^\\s]+-[0-9]+x[0-9]+", inst.profile)) + ]) + error_message = "The profile must be a valid virtual server instance profile." + } + validation { + condition = length(var.dynamic_compute_instances) == 1 + error_message = "Only a single map (one instance profile) is allowed for dynamic compute instances." 
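+    # A minimal accepted value, per the single-profile rule above (these are the variable's own defaults):
+    #   [{ profile = "bx2-4x16", count = 500, image = "hpc-lsf-fp15-compute-rhel810-v1" }]
+    # Supplying two objects with different profiles is rejected by this validation.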
+ } + validation { + condition = alltrue([ + for inst in var.dynamic_compute_instances : ( + (!can(regex("fp15", inst.image)) || var.lsf_version == "fixpack_15") && + (!can(regex("fp14", inst.image)) || var.lsf_version == "fixpack_14") + ) + ]) + error_message = "Mismatch between dynamic_compute_instances image and lsf_version. Use an image with 'fp14' only when lsf_version is fixpack_14, and 'fp15' only with fixpack_15." + } +} + +############################################################################## +# File share variables +############################################################################## +variable "storage_security_group_id" { + type = string + default = null + description = "Provide the storage security group ID from the Spectrum Scale storage cluster when an nfs_share value is specified for a given mount_path in the cluster_file_share variable. This security group is necessary to enable network connectivity between the Spectrum LSF cluster nodes and the NFS mount point, ensuring successful access to the shared file system." + validation { + condition = length([for share in var.custom_file_shares : share.nfs_share if share.nfs_share != null && share.nfs_share != ""]) == 0 || var.storage_security_group_id != null + error_message = "Storage security group ID cannot be null when NFS share mount path is provided under cluster_file_shares variable." + } +} + +variable "custom_file_shares" { + type = list(object({ + mount_path = string, + size = optional(number), + iops = optional(number), + nfs_share = optional(string) + })) + default = [{ mount_path = "/mnt/vpcstorage/tools", size = 100, iops = 2000 }, { mount_path = "/mnt/vpcstorage/data", size = 100, iops = 6000 }, { mount_path = "/mnt/scale/tools", nfs_share = "" }] + description = "Provide details for customizing your shared file storage layout, including mount points, sizes (in GB), and IOPS ranges for up to five file shares if using VPC file storage as the storage option.If using IBM Storage Scale as an NFS mount, update the appropriate mount path and nfs_share values created from the Storage Scale cluster. Note that VPC file storage supports attachment to a maximum of 256 nodes. Exceeding this limit may result in mount point failures due to attachment restrictions.For more information, see [Storage options](https://cloud.ibm.com/docs/hpc-ibm-spectrumlsf?topic=hpc-ibm-spectrumlsf-integrating-scale#integrate-scale-and-hpc)." + validation { + condition = length([for item in var.custom_file_shares : item if item.nfs_share == null]) <= 5 + error_message = "The VPC storage custom file share count \"custom_file_shares\" must be less than or equal to 5. Unlimited NFS mounts are allowed." + } + validation { + condition = length([for mounts in var.custom_file_shares : mounts.mount_path]) == length(toset([for mounts in var.custom_file_shares : mounts.mount_path])) + error_message = "Mount path values should not be duplicated." + } + validation { + condition = alltrue([for mounts in var.custom_file_shares : can(mounts.size) && mounts.size != null ? (10 <= mounts.size && mounts.size <= 32000) : true]) + error_message = "The custom_file_share size must be greater than or equal to 10 and less than or equal to 32000." + } + validation { + condition = alltrue([ + for share in var.custom_file_shares : ( + share.size != null && share.iops != null ? 
+ anytrue([ + for r in local.custom_fileshare_iops_range : + share.size >= r[0] && share.size <= r[1] && share.iops >= r[2] && share.iops <= r[3] + ]) : true + ) + ]) + error_message = "Provided iops value is not valid for given file share size. Please refer 'File Storage for VPC profiles' page in IBM Cloud docs for a valid IOPS and size combination." + } +} + +############################################################################## +# DNS Variables +############################################################################## + +variable "dns_instance_id" { + type = string + default = null + description = "Specify the ID of an existing IBM Cloud DNS service instance. When provided, domain names are created within the specified instance. If set to null, a new DNS service instance is created, and the required DNS zones are associated with it." +} + +variable "dns_custom_resolver_id" { + type = string + default = null + description = "Specify the ID of an existing IBM Cloud DNS custom resolver to avoid creating a new one. If set to null, a new custom resolver will be created and associated with the VPC. Note: A VPC can be associated with only one custom resolver. When using an existing VPC, if a custom resolver is already associated and this ID is not provided, the deployment will fail." + validation { + condition = var.vpc_name != null || var.dns_custom_resolver_id == null + error_message = "If this is a new VPC deployment (vpc_name is null), do not provide dns_custom_resolver_id, as it may impact name resolution." + } +} + +variable "dns_domain_name" { + type = object({ + compute = string + }) + default = { + compute = "lsf.com" + } + description = "IBM Cloud DNS Services domain name to be used for the IBM Spectrum LSF cluster." + validation { + condition = can(regex("^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\\.com$", var.dns_domain_name.compute)) + error_message = "The compute domain name must be a valid FQDN ending in '.com'. It may include letters, digits, hyphens, and must start and end with an alphanumeric character." + } +} + +############################################################################## +# Encryption Variables +############################################################################## +variable "key_management" { + type = string + default = "key_protect" + description = "Set the value as key_protect to enable customer managed encryption for boot volume and file share. If the key_management is set as null, IBM Cloud resources will be always be encrypted through provider managed." + validation { + condition = var.key_management == "null" || var.key_management == null || var.key_management == "key_protect" + error_message = "key_management must be either 'null', null, or 'key_protect'." + } + validation { + condition = ( + var.kms_instance_name == null && + (var.key_management == "null" || var.key_management == null || var.key_management == "key_protect") + ) || ( + var.kms_instance_name != null && var.key_management == "key_protect" + ) + error_message = "If kms_instance_name is provided, key_management must be 'key_protect'. If kms_instance_name is null, key_management can be 'key_protect', 'null' (string), or null (literal)." + } +} + +variable "kms_instance_name" { + type = string + default = null + description = "Provide the name of the existing Key Protect instance associated with the Key Management Service. Note: To use existing kms_instance_name set key_management as key_protect. 
The name can be found under the details of the KMS, see [View key-protect ID](https://cloud.ibm.com/docs/key-protect?topic=key-protect-retrieve-instance-ID&interface=ui)." +} + +variable "kms_key_name" { + type = string + default = null + description = "Provide the existing kms key name that you want to use for the IBM Cloud HPC cluster. Note: kms_key_name to be considered only if key_management value is set as key_protect.(for example kms_key_name: my-encryption-key)." + validation { + condition = anytrue([alltrue([var.kms_key_name != null, var.kms_instance_name != null]), (var.kms_key_name == null), (var.key_management != "key_protect")]) + error_message = "Please make sure you are passing the kms_instance_name if you are passing kms_key_name." + } +} + +variable "skip_iam_share_authorization_policy" { + type = bool + default = false + description = "When using an existing KMS instance name, set this value to true if authorization is already enabled between KMS instance and the VPC file share. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment.For more information on how to create authorization policy manually, see [creating authorization policies for VPC file share](https://cloud.ibm.com/docs/vpc?topic=vpc-file-s2s-auth&interface=ui)." +} + +############################################################################## +# LDAP Variables +############################################################################## +variable "enable_ldap" { + type = bool + default = false + description = "Set this option to true to enable LDAP for IBM Spectrum LSF, with the default value set to false." +} + +variable "ldap_basedns" { + type = string + default = "lsf.com" + description = "The dns domain name is used for configuring the LDAP server. If an LDAP server is already in existence, ensure to provide the associated DNS domain name." + validation { + condition = var.enable_ldap == false || (var.ldap_basedns != null ? (length(trimspace(var.ldap_basedns)) > 0 && var.ldap_basedns != "null") : false) + error_message = "If LDAP is enabled, then the base DNS should not be empty or null. Need a valid domain name." + } +} + +variable "ldap_server" { + type = string + default = null + description = "Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created." + validation { + condition = var.enable_ldap == false || var.ldap_server == null || (var.ldap_server != null ? (length(trimspace(var.ldap_server)) > 0 && var.ldap_server != "null") : true) + error_message = "If LDAP is enabled, an existing LDAP server IP should be provided." + } +} + +variable "ldap_server_cert" { + type = string + sensitive = true + default = null + description = "Provide the existing LDAP server certificate. This value is required if the 'ldap_server' variable is not set to null. If the certificate is not provided or is invalid, the LDAP configuration may fail. For more information on how to create or obtain the certificate, please refer [existing LDAP server certificate](https://cloud.ibm.com/docs/allowlist/hpc-service?topic=hpc-service-integrating-openldap)." + validation { + condition = var.enable_ldap == false || var.ldap_server == null || (var.ldap_server_cert != null ? (length(trimspace(var.ldap_server_cert)) > 0 && var.ldap_server_cert != "null") : false) + error_message = "Provide the current LDAP server certificate. This is required if 'ldap_server' is set; otherwise, the LDAP configuration will not succeed." 
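+    # Summary of how the ldap_server and ldap_server_cert checks treat the inputs (illustrative):
+    #   enable_ldap = false                        -> ldap_server and ldap_server_cert are ignored
+    #   enable_ldap = true, ldap_server = null     -> a new LDAP server is created; no certificate needed
+    #   enable_ldap = true, ldap_server = "<IP>"   -> ldap_server_cert must hold that server's certificate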
+ } +} + +variable "ldap_admin_password" { + type = string + sensitive = true + default = null + description = "The LDAP admin password must be 8 to 20 characters long and include at least two alphabetic characters (with one uppercase and one lowercase), one number, and one special character from the set (!@#$%^&*()_+=-). The password must not contain the username or any spaces. [This value is ignored for an existing LDAP server]." + validation { + condition = (!var.enable_ldap || var.ldap_server != null || can(var.ldap_admin_password != null && length(var.ldap_admin_password) >= 8 && length(var.ldap_admin_password) <= 20 && regex(".*[0-9].*", var.ldap_admin_password) != "" && regex(".*[A-Z].*", var.ldap_admin_password) != "" && regex(".*[a-z].*", var.ldap_admin_password) != "" && regex(".*[!@#$%^&*()_+=-].*", var.ldap_admin_password) != "" && !can(regex(".*\\s.*", var.ldap_admin_password)))) + error_message = "The LDAP admin password must be 8 to 20 characters long and include at least two alphabetic characters (with one uppercase and one lowercase), one number, and one special character from the set (!@#$%^&*()_+=-). The password must not contain the username or any spaces." + } +} + +variable "ldap_user_name" { + type = string + default = "" + description = "Custom LDAP User for performing cluster operations. Note: Username should be between 4 to 32 characters, (any combination of lowercase and uppercase letters).[This value is ignored for an existing LDAP server]" + validation { + condition = var.enable_ldap == false || var.ldap_server != null || (length(var.ldap_user_name) >= 4 && length(var.ldap_user_name) <= 32 && var.ldap_user_name != "" && can(regex("^[a-zA-Z0-9_-]*$", var.ldap_user_name)) && trimspace(var.ldap_user_name) != "") + error_message = "LDAP username must be between 4-32 characters long and can only contain letters, numbers, hyphens, and underscores. Spaces are not permitted." + } +} + +variable "ldap_user_password" { + type = string + sensitive = true + default = "" + description = "The LDAP user password must be 8 to 20 characters long and include at least two alphabetic characters (with one uppercase and one lowercase), one numeric digit, and at least one special character from the set (!@#$%^&*()_+=-). Spaces are not allowed. The password must not contain the username for enhanced security. [This value is ignored for an existing LDAP server]." + validation { + condition = !var.enable_ldap || var.ldap_server != null || ((replace(lower(var.ldap_user_password), lower(var.ldap_user_name), "") == lower(var.ldap_user_password)) && length(var.ldap_user_password) >= 8 && length(var.ldap_user_password) <= 20 && can(regex("^(.*[0-9]){1}.*$", var.ldap_user_password))) && can(regex("^(.*[A-Z]){1}.*$", var.ldap_user_password)) && can(regex("^(.*[a-z]){1}.*$", var.ldap_user_password)) && can(regex("^.*[!@#$%^&*()_+=-].*$", var.ldap_user_password)) && !can(regex(".*\\s.*", var.ldap_user_password)) + error_message = "The LDAP user password must be 8 to 20 characters long and include at least two alphabetic characters (with one uppercase and one lowercase), one number, and one special character from the set (!@#$%^&*()_+=-). The password must not contain the username or any spaces." + } +} + +variable "ldap_instance" { + type = list( + object({ + profile = string + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + image = "ibm-ubuntu-22-04-5-minimal-amd64-3" + }] + description = "Specify the compute instance profile and image to be used for deploying LDAP instances. 
Only Debian-based operating systems, such as Ubuntu, are supported for LDAP functionality." + validation { + condition = alltrue([ + for inst in var.ldap_instance : can(regex("^[^\\s]+-[0-9]+x[0-9]+", inst.profile)) + ]) + error_message = "The profile must be a valid virtual server instance profile." + } +} + + +############################################################################## +# Additional feature Variables +############################################################################## +variable "enable_cos_integration" { + type = bool + default = true + description = "Set to true to create an extra cos bucket to integrate with HPC cluster deployment." +} + +variable "cos_instance_name" { + type = string + default = null + description = "Provide the name of the existing COS instance where the logs for the enabled functionalities will be stored." +} + +variable "enable_vpc_flow_logs" { + type = bool + default = true + description = "This flag determines whether VPC flow logs are enabled. When set to true, a flow log collector will be created to capture and monitor network traffic data within the VPC. Enabling flow logs provides valuable insights for troubleshooting, performance monitoring, and security auditing by recording information about the traffic passing through your VPC. Consider enabling this feature to enhance visibility and maintain robust network management practices." +} + +variable "vpn_enabled" { + type = bool + default = false + description = "Set the value as true to deploy a VPN gateway for VPC in the cluster." +} + +variable "enable_hyperthreading" { + type = bool + default = true + description = "Setting this to true will enable hyper-threading in the worker nodes of the cluster (default). Otherwise, hyper-threading will be disabled." +} +############################################################################## +# Observability Variables +############################################################################## + +variable "observability_atracker_enable" { + type = bool + default = true + description = "Activity Tracker Event Routing to configure how to route auditing events. While multiple Activity Tracker instances can be created, only one tracker is needed to capture all events. Creating additional trackers is unnecessary if an existing Activity Tracker is already integrated with a COS bucket. In such cases, set the value to false, as all events can be monitored and accessed through the existing Activity Tracker." +} + +variable "observability_atracker_target_type" { + type = string + default = "cloudlogs" + description = "Specify the target where Atracker events will be stored—either IBM Cloud Logs or a Cloud Object Storage (COS) bucket—based on the selected value. This allows the logs to be accessed or integrated with external systems." + validation { + condition = contains(["cloudlogs", "cos"], var.observability_atracker_target_type) + error_message = "Allowed values for atracker target type is cloudlogs and cos." + } +} + +variable "observability_monitoring_enable" { + description = "Enables or disables IBM Cloud Monitoring integration. When enabled, metrics from both the infrastructure and LSF application running on Management Nodes will be collected. This must be set to true if monitoring is required on management nodes." 
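  # Editor's sketch: the validation below couples the two monitoring flags, so a
  # consistent combination (hypothetical values) looks like:
  #   observability_monitoring_enable                  = true
  #   observability_monitoring_on_compute_nodes_enable = true  # only allowed when the flag above is true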
+ type = bool + default = true + validation { + condition = var.observability_monitoring_enable == true || var.observability_monitoring_on_compute_nodes_enable == false + error_message = "To enable monitoring on compute nodes, IBM Cloud Monitoring must also be enabled." + } +} + +variable "observability_logs_enable_for_management" { + description = "Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Management Nodes will be ingested." + type = bool + default = false +} + +variable "observability_logs_enable_for_compute" { + description = "Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Compute Nodes will be ingested." + type = bool + default = false +} + +variable "observability_enable_platform_logs" { + description = "Setting this to true will create a tenant in the same region that the Cloud Logs instance is provisioned to enable platform logs for that region. NOTE: You can only have 1 tenant per region in an account." + type = bool + default = false +} + +variable "observability_enable_metrics_routing" { + description = "Enable metrics routing to manage metrics at the account-level by configuring targets and routes that define where data points are routed." + type = bool + default = false +} + +variable "observability_logs_retention_period" { + description = "The number of days IBM Cloud Logs will retain the logs data in Priority insights. Allowed values: 7, 14, 30, 60, 90." + type = number + default = 7 + validation { + condition = contains([7, 14, 30, 60, 90], var.observability_logs_retention_period) + error_message = "Allowed values for cloud logs retention period is 7, 14, 30, 60, 90." + } +} + +variable "observability_monitoring_on_compute_nodes_enable" { + description = "Enables or disables IBM Cloud Monitoring integration. When enabled, metrics from both the infrastructure and LSF application running on compute Nodes will be collected. This must be set to true if monitoring is required on compute nodes." + type = bool + default = false +} + +variable "observability_monitoring_plan" { + description = "Type of service plan for IBM Cloud Monitoring instance. You can choose one of the following: lite, graduated-tier. For all details visit [IBM Cloud Monitoring Service Plans](https://cloud.ibm.com/docs/monitoring?topic=monitoring-service_plans)." + type = string + default = "graduated-tier" + validation { + condition = can(regex("lite|graduated-tier", var.observability_monitoring_plan)) + error_message = "Please enter a valid plan for IBM Cloud Monitoring, for all details visit https://cloud.ibm.com/docs/monitoring?topic=monitoring-service_plans." + } +} + +variable "skip_flowlogs_s2s_auth_policy" { + type = bool + default = false + description = "When using an existing COS instance, set this value to true if authorization is already enabled between COS instance and the flow logs service. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment." +} + +variable "skip_kms_s2s_auth_policy" { + type = bool + default = false + description = "When using an existing COS instance, set this value to true if authorization is already enabled between COS instance and the kms. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment." 
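  # Editor's sketch: the skip_*_auth_policy flags assume a service-to-service
  # authorization already exists. A hedged example for flow logs writing to COS
  # (the service and resource-type names are assumptions; verify against the IBM
  # provider documentation before relying on them):
  #   resource "ibm_iam_authorization_policy" "flowlogs_to_cos" {
  #     source_service_name  = "is"
  #     source_resource_type = "flow-log-collector"
  #     target_service_name  = "cloud-object-storage"
  #     roles                = ["Writer"]
  #   }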
+} + +variable "skip_iam_block_storage_authorization_policy" { + type = bool + default = false + description = "When using an existing KMS instance name, set this value to true if authorization is already enabled between KMS instance and the block storage volume. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment.For more information on how to create authorization policy manually, see [creating authorization policies for block storage volume](https://cloud.ibm.com/docs/vpc?topic=vpc-block-s2s-auth&interface=ui)." +} + +############################################################################## +# Override JSON +############################################################################## +variable "override" { + type = bool + default = false + description = "Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment." + +} + +variable "override_json_string" { + type = string + default = null + description = "Override default values with a JSON object. Any JSON other than an empty string overrides other configuration changes." +} + +############################################################################## +# Dedicatedhost Variables +############################################################################## + +variable "enable_dedicated_host" { + type = bool + default = false + description = "Set this option to true to enable dedicated hosts for the VSIs provisioned as workload servers. The default value is false. When dedicated hosts are enabled, multiple vsi instance profiles from the same or different families (e.g., bx2, cx2, mx2) can be used. If you plan to deploy a static cluster with a third-generation profile, ensure that dedicated host support is available in the selected region, as not all regions support third-gen profiles on dedicated hosts. To learn more about dedicated host, [click here.](https://cloud.ibm.com/docs/vpc?topic=vpc-dh-profiles&interface=ui)." +} + +########################################################################### +# Existing Bastion Support variables +########################################################################### + +variable "existing_bastion_instance_name" { + type = string + default = null + description = "Provide the name of the bastion instance. If none given then new bastion will be created." + validation { + condition = var.existing_bastion_instance_name == null || ( + var.existing_bastion_instance_public_ip != null && + var.existing_bastion_security_group_id != null && + var.existing_bastion_ssh_private_key != null + ) + error_message = "If bastion_instance_name is set, then bastion_instance_public_ip, bastion_security_group_id, and bastion_ssh_private_key must also be provided." + } +} + +variable "existing_bastion_instance_public_ip" { + type = string + default = null + description = "Provide the public ip address of the existing bastion instance to establish the remote connection. Also using this public ip address, connection to the LSF cluster nodes shall be established" +} + +variable "existing_bastion_security_group_id" { + type = string + default = null + description = "Specify the security group ID for the bastion server. This ID will be added as an allowlist rule on the HPC cluster nodes to facilitate secure SSH connections through the bastion node. 
By restricting access through a bastion server, this setup enhances security by controlling and monitoring entry points into the cluster environment. Ensure that the specified security group is correctly configured to permit only authorized traffic for secure and efficient management of cluster resources." +} + +variable "existing_bastion_ssh_private_key" { + type = string + sensitive = true + default = null + description = "Provide the private SSH key (named id_rsa) used during the creation and configuration of the bastion server to securely authenticate and connect to the bastion server. This allows access to internal network resources from a secure entry point. Note: The corresponding public SSH key (named id_rsa.pub) must already be available in the ~/.ssh/authorized_keys file on the bastion host to establish authentication." +} + +############################################################################## +# Environment Variables +############################################################################## + +# tflint-ignore: all +variable "TF_VERSION" { + type = string + default = "1.9" + description = "The version of the Terraform engine that's used in the Schematics workspace." +} + +# tflint-ignore: all +variable "TF_PARALLELISM" { + type = string + default = "250" + description = "Parallelism/ concurrent operations limit. Valid values are between 1 and 256, both inclusive. [Learn more](https://www.terraform.io/docs/internals/graph.html#walking-the-graph)." + validation { + condition = 1 <= var.TF_PARALLELISM && var.TF_PARALLELISM <= 256 + error_message = "Input \"TF_PARALLELISM\" must be greater than or equal to 1 and less than or equal to 256." + } +} + +############################################################################## +# SCC Variables +############################################################################## + +variable "sccwp_service_plan" { + description = "Specify the plan type for the Security and Compliance Center (SCC) Workload Protection instance. Valid values are free-trial and graduated-tier only." + type = string + default = "free-trial" + validation { + error_message = "Plan for SCC Workload Protection instances can only be `free-trial` or `graduated-tier`." + condition = contains( + ["free-trial", "graduated-tier"], + var.sccwp_service_plan + ) + } +} + +variable "sccwp_enable" { + type = bool + default = true + description = "Set this flag to true to create an instance of IBM Security and Compliance Center (SCC) Workload Protection. When enabled, it provides tools to discover and prioritize vulnerabilities, monitor for security threats, and enforce configuration, permission, and compliance policies across the full lifecycle of your workloads. To view the data on the dashboard, enable the cspm to create the app configuration and required trusted profile policies.[Learn more](https://cloud.ibm.com/docs/workload-protection?topic=workload-protection-about)." +} + +variable "cspm_enabled" { + description = "CSPM (Cloud Security Posture Management) is a set of tools and practices that continuously monitor and secure cloud infrastructure. When enabled, it creates a trusted profile with viewer access to the App Configuration and Enterprise services for the SCC Workload Protection instance. Make sure the required IAM permissions are in place, as missing permissions will cause deployment to fail. If CSPM is disabled, dashboard data will not be available.[Learn more](https://cloud.ibm.com/docs/workload-protection?topic=workload-protection-about)." 
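  # Editor's sketch: the four existing-bastion inputs declared above must be set
  # together (the validation on existing_bastion_instance_name enforces this);
  # an illustrative, entirely hypothetical combination:
  #   existing_bastion_instance_name      = "my-bastion"
  #   existing_bastion_instance_public_ip = "169.48.10.11"
  #   existing_bastion_security_group_id  = "r010-1a2b3c4d"
  #   existing_bastion_ssh_private_key    = file("./id_rsa")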
+ type = bool + default = true + nullable = false +} + +variable "app_config_plan" { + description = "Specify the IBM service pricing plan for the app configuration. Allowed values are 'basic', 'lite', 'standardv2', 'enterprise'." + type = string + default = "basic" + validation { + error_message = "Plan for App configuration can only be basic, lite, standardv2, enterprise.." + condition = contains( + ["basic", "lite", "standardv2", "enterprise"], + var.app_config_plan + ) + } +} diff --git a/solutions/lsf/version.tf b/solutions/lsf/version.tf new file mode 100644 index 00000000..93f82bed --- /dev/null +++ b/solutions/lsf/version.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.9.0" + required_providers { + ibm = { + source = "IBM-Cloud/ibm" + version = ">= 1.68.1, < 2.0.0" + } + } +} + +provider "ibm" { + ibmcloud_api_key = var.ibmcloud_api_key + region = local.region +} diff --git a/solutions/scale/README.md b/solutions/scale/README.md new file mode 100644 index 00000000..122d80c7 --- /dev/null +++ b/solutions/scale/README.md @@ -0,0 +1,79 @@ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [ibm](#requirement\_ibm) | >= 1.68.1, < 2.0.0 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [scale](#module\_scale) | ./../.. | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [allowed\_cidr](#input\_allowed\_cidr) | Network CIDR to access the VPC. This is used to manage network ACL rules for accessing the cluster. | `list(string)` | n/a | yes | +| [bastion\_image](#input\_bastion\_image) | The image to use to deploy the bastion host. | `string` | `"ibm-ubuntu-22-04-3-minimal-amd64-1"` | no | +| [bastion\_instance\_profile](#input\_bastion\_instance\_profile) | Deployer should be only used for better deployment performance | `string` | `"cx2-4x8"` | no | +| [bastion\_ssh\_keys](#input\_bastion\_ssh\_keys) | The key pair to use to access the bastion host. | `list(string)` | `null` | no | +| [bastion\_subnets\_cidr](#input\_bastion\_subnets\_cidr) | Subnet CIDR block to launch the bastion host. | `string` | `"10.0.0.0/24"` | no | +| [client\_instances](#input\_client\_instances) | Number of instances to be launched for client. |
<pre>list(<br>  object({<br>    profile = string<br>    count = number<br>    image = string<br>  })<br>)</pre> | <pre>[<br>  {<br>    "count": 2,<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "cx2-2x4"<br>  }<br>]</pre>
| no | +| [client\_ssh\_keys](#input\_client\_ssh\_keys) | The key pair to use to launch the client host. | `list(string)` | `null` | no | +| [client\_subnets\_cidr](#input\_client\_subnets\_cidr) | Subnet CIDR block to launch the client host. | `string` | `"10.10.10.0/24"` | no | +| [compute\_gui\_password](#input\_compute\_gui\_password) | Password for compute cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [compute\_gui\_username](#input\_compute\_gui\_username) | GUI user to perform system management and monitoring tasks on compute cluster. | `string` | `"admin"` | no | +| [compute\_instances](#input\_compute\_instances) | Total Number of instances to be launched for compute cluster. |
<pre>list(<br>  object({<br>    profile = string<br>    count = number<br>    image = string<br>  })<br>)</pre> | <pre>[<br>  {<br>    "count": 3,<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "cx2-2x4"<br>  }<br>]</pre>
| no | +| [compute\_ssh\_keys](#input\_compute\_ssh\_keys) | The key pair to use to launch the compute host. | `list(string)` | `null` | no | +| [compute\_subnets\_cidr](#input\_compute\_subnets\_cidr) | Subnet CIDR block to launch the compute cluster host. | `string` | `"10.10.20.0/24"` | no | +| [cos\_instance\_name](#input\_cos\_instance\_name) | Exiting COS instance name | `string` | `null` | no | +| [deployer\_image](#input\_deployer\_image) | The image to use to deploy the deployer host. | `string` | `"ibm-redhat-8-10-minimal-amd64-2"` | no | +| [deployer\_instance\_profile](#input\_deployer\_instance\_profile) | Deployer should be only used for better deployment performance | `string` | `"mx2-4x32"` | no | +| [dns\_custom\_resolver\_id](#input\_dns\_custom\_resolver\_id) | IBM Cloud DNS custom resolver id. | `string` | `null` | no | +| [dns\_domain\_names](#input\_dns\_domain\_names) | IBM Cloud HPC DNS domain names. |
<pre>object({<br>  compute = string<br>  storage = string<br>  protocol = string<br>})</pre> | <pre>{<br>  "compute": "comp.com",<br>  "protocol": "ces.com",<br>  "storage": "strg.com"<br>}</pre>
| no | +| [dns\_instance\_id](#input\_dns\_instance\_id) | IBM Cloud HPC DNS service instance id. | `string` | `null` | no | +| [enable\_atracker](#input\_enable\_atracker) | Enable Activity tracker | `bool` | `true` | no | +| [enable\_bastion](#input\_enable\_bastion) | The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false. | `bool` | `true` | no | +| [enable\_cos\_integration](#input\_enable\_cos\_integration) | Integrate COS with HPC solution | `bool` | `true` | no | +| [enable\_deployer](#input\_enable\_deployer) | Deployer should be only used for better deployment performance | `bool` | `false` | no | +| [enable\_vpc\_flow\_logs](#input\_enable\_vpc\_flow\_logs) | Enable Activity tracker | `bool` | `true` | no | +| [enable\_vpn](#input\_enable\_vpn) | The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN, set this value to true. | `bool` | `false` | no | +| [existing\_resource\_group](#input\_existing\_resource\_group) | String describing resource groups to create or reference | `string` | `"Default"` | no | +| [hpcs\_instance\_name](#input\_hpcs\_instance\_name) | Hyper Protect Crypto Service instance | `string` | `null` | no | +| [ibm\_customer\_number](#input\_ibm\_customer\_number) | Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn). | `string` | n/a | yes | +| [ibmcloud\_api\_key](#input\_ibmcloud\_api\_key) | IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required. | `string` | n/a | yes | +| [key\_management](#input\_key\_management) | Set the value as key\_protect to enable customer managed encryption for boot volume and file share. If the key\_management is set as null, IBM Cloud resources will be always be encrypted through provider managed. | `string` | `"key_protect"` | no | +| [network\_cidr](#input\_network\_cidr) | Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning. | `string` | `"10.0.0.0/8"` | no | +| [override](#input\_override) | Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment. | `bool` | `false` | no | +| [override\_json\_string](#input\_override\_json\_string) | Override default values with a JSON object. Any JSON other than an empty string overrides other configuration changes. | `string` | `null` | no | +| [placement\_strategy](#input\_placement\_strategy) | VPC placement groups to create (null / host\_spread / power\_spread) | `string` | `null` | no | +| [cluster_prefix](#input\_prefix) | A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters. | `string` | `"scale"` | no | +| [protocol\_instances](#input\_protocol\_instances) | Number of instances to be launched for protocol hosts. |
<pre>list(<br>  object({<br>    profile = string<br>    count = number<br>    image = string<br>  })<br>)</pre> | <pre>[<br>  {<br>    "count": 2,<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "bx2-2x8"<br>  }<br>]</pre>
| no | +| [protocol\_subnets\_cidr](#input\_protocol\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `string` | `"10.10.40.0/24"` | no | +| [ssh\_keys](#input\_ssh\_keys) | The key pair to use to access the HPC cluster. | `list(string)` | `null` | no | +| [storage\_gui\_password](#input\_storage\_gui\_password) | Password for storage cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [storage\_gui\_username](#input\_storage\_gui\_username) | GUI user to perform system management and monitoring tasks on storage cluster. | `string` | `"admin"` | no | +| [storage\_instances](#input\_storage\_instances) | Number of instances to be launched for storage cluster. |
<pre>list(<br>  object({<br>    profile = string<br>    count = number<br>    image = string<br>    filesystem = optional(string)<br>  })<br>)</pre> | <pre>[<br>  {<br>    "count": 2,<br>    "filesystem": "fs1",<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "bx2-2x8"<br>  }<br>]</pre>
| no | +| [storage\_ssh\_keys](#input\_storage\_ssh\_keys) | The key pair to use to launch the storage cluster host. | `list(string)` | `null` | no | +| [storage\_subnets\_cidr](#input\_storage\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `string` | `"10.10.30.0/24"` | no | +| [vpc](#input\_vpc) | Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc) | `string` | `null` | no | +| [vpn\_peer\_address](#input\_vpn\_peer\_address) | The peer public IP address to which the VPN will be connected. | `string` | `null` | no | +| [vpn\_peer\_cidr](#input\_vpn\_peer\_cidr) | The peer CIDRs (e.g., 192.168.0.0/24) to which the VPN will be connected. | `list(string)` | `null` | no | +| [vpn\_preshared\_key](#input\_vpn\_preshared\_key) | The pre-shared key for the VPN. | `string` | `null` | no | +| [zone](#input\_zone) | Zone where VPC will be created. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [scale](#output\_scale) | Scale details | diff --git a/solutions/scale/catalogValidationValues.json.template b/solutions/scale/catalogValidationValues.json.template new file mode 100644 index 00000000..bb5298d4 --- /dev/null +++ b/solutions/scale/catalogValidationValues.json.template @@ -0,0 +1,7 @@ +{ + "ibmcloud_api_key": $VALIDATION_APIKEY, + "cluster_prefix": $PREFIX, + "zones": "[\"ca-tor-1\"]", + "existing_resource_group": "geretain-hpc-rg", + "ssh_keys": "[\"geretain-hpc-ssh-key\"]" +} diff --git a/solutions/scale/input_validation.tf b/solutions/scale/input_validation.tf new file mode 100644 index 00000000..066abd17 --- /dev/null +++ b/solutions/scale/input_validation.tf @@ -0,0 +1,13 @@ +################################################### +# Copyright (C) IBM Corp. 2023 All Rights Reserved. +# Licensed under the Apache License v2.0 +################################################### +# This file contains the complete information on all the validations performed from the code during the generate plan process +# Validations are performed to make sure, the appropriate error messages are displayed to user in-order to provide required input parameter + +locals { + icn_cnd = (var.storage_type != "evaluation" && var.ibm_customer_number == null) ? false : true + icn_msg = "The IBM customer number input value can't be empty when storage_type is not evaluation." + # tflint-ignore: terraform_unused_declarations + icn_chk = regex("^${local.icn_msg}$", (local.icn_cnd ? local.icn_msg : "")) +} diff --git a/solutions/scale/locals.tf b/solutions/scale/locals.tf new file mode 100644 index 00000000..90f43d9c --- /dev/null +++ b/solutions/scale/locals.tf @@ -0,0 +1,170 @@ +# locals needed for ibm provider +locals { + # Region and Zone calculations + region = join("-", slice(split("-", var.zones[0]), 0, 2)) +} + +locals { + override_json_path = abspath("./override.json") + override = { + override = jsondecode(var.override && var.override_json_string == null ? + (local.override_json_path == "" ? file("${path.root}/override.json") : file(local.override_json_path)) + : + "{}") + override_json_string = jsondecode(var.override_json_string == null ? "{}" : var.override_json_string) + } + override_type = var.override_json_string == null ? 
"override" : "override_json_string" +} + +locals { + config = { + existing_resource_group = var.existing_resource_group + remote_allowed_ips = var.remote_allowed_ips + ssh_keys = var.ssh_keys + vpc_cluster_login_private_subnets_cidr_blocks = var.vpc_cluster_login_private_subnets_cidr_blocks + compute_gui_password = var.compute_gui_password + compute_gui_username = var.compute_gui_username + vpc_cluster_private_subnets_cidr_blocks = var.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = var.cos_instance_name + dns_custom_resolver_id = var.dns_custom_resolver_id + dns_instance_id = var.dns_instance_id + dns_domain_names = var.dns_domain_names + enable_atracker = var.enable_atracker + # enable_bastion = var.enable_bastion + bastion_instance = var.bastion_instance + deployer_instance = var.deployer_instance + enable_cos_integration = var.enable_cos_integration + enable_vpc_flow_logs = var.enable_vpc_flow_logs + hpcs_instance_name = var.hpcs_instance_name + key_management = var.key_management + client_instances = var.client_instances + client_subnets_cidr = var.client_subnets_cidr + vpc_cidr = var.vpc_cidr + placement_strategy = var.placement_strategy + cluster_prefix = var.cluster_prefix + protocol_instances = var.protocol_instances + protocol_subnets_cidr = var.protocol_subnets_cidr + compute_instances = var.compute_instances + storage_gui_password = var.storage_gui_password + storage_gui_username = var.storage_gui_username + storage_instances = var.storage_instances + storage_servers = var.storage_servers + storage_subnets_cidr = var.storage_subnets_cidr + vpc_name = var.vpc_name + observability_atracker_enable = var.observability_atracker_enable + observability_atracker_target_type = var.observability_atracker_target_type + observability_monitoring_enable = var.observability_monitoring_enable + observability_logs_enable_for_management = var.observability_logs_enable_for_management + observability_logs_enable_for_compute = var.observability_logs_enable_for_compute + observability_enable_platform_logs = var.observability_enable_platform_logs + observability_enable_metrics_routing = var.observability_enable_metrics_routing + observability_logs_retention_period = var.observability_logs_retention_period + observability_monitoring_on_compute_nodes_enable = var.observability_monitoring_on_compute_nodes_enable + observability_monitoring_plan = var.observability_monitoring_plan + skip_flowlogs_s2s_auth_policy = var.skip_flowlogs_s2s_auth_policy + skip_kms_s2s_auth_policy = var.skip_kms_s2s_auth_policy + skip_iam_block_storage_authorization_policy = var.skip_iam_block_storage_authorization_policy + ibmcloud_api_key = var.ibmcloud_api_key + afm_instances = var.afm_instances + afm_cos_config = var.afm_cos_config + enable_ldap = var.enable_ldap + ldap_basedns = var.ldap_basedns + ldap_admin_password = var.ldap_admin_password + ldap_user_name = var.ldap_user_name + ldap_user_password = var.ldap_user_password + ldap_server = var.ldap_server + ldap_server_cert = var.ldap_server_cert + ldap_instance = var.ldap_instance + scale_encryption_enabled = var.scale_encryption_enabled + scale_encryption_type = var.scale_encryption_type + gklm_instance_key_pair = var.gklm_instance_key_pair + gklm_instances = var.gklm_instances + storage_type = var.storage_type + colocate_protocol_instances = var.colocate_protocol_instances + scale_encryption_admin_default_password = var.scale_encryption_admin_default_password + scale_encryption_admin_password = var.scale_encryption_admin_password + 
scale_encryption_admin_username = var.scale_encryption_admin_username + filesystem_config = var.filesystem_config + existing_bastion_instance_name = var.existing_bastion_instance_name + existing_bastion_instance_public_ip = var.existing_bastion_instance_public_ip + existing_bastion_security_group_id = var.existing_bastion_security_group_id + existing_bastion_ssh_private_key = var.existing_bastion_ssh_private_key + } +} + +# Compile Environment for Config output +locals { + env = { + existing_resource_group = lookup(local.override[local.override_type], "existing_resource_group", local.config.existing_resource_group) + remote_allowed_ips = lookup(local.override[local.override_type], "remote_allowed_ips", local.config.remote_allowed_ips) + ssh_keys = lookup(local.override[local.override_type], "ssh_keys", local.config.ssh_keys) + vpc_cluster_login_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_login_private_subnets_cidr_blocks", local.config.vpc_cluster_login_private_subnets_cidr_blocks) + compute_gui_password = lookup(local.override[local.override_type], "compute_gui_password", local.config.compute_gui_password) + compute_gui_username = lookup(local.override[local.override_type], "compute_gui_username", local.config.compute_gui_username) + vpc_cluster_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_private_subnets_cidr_blocks", local.config.vpc_cluster_private_subnets_cidr_blocks) + cos_instance_name = lookup(local.override[local.override_type], "cos_instance_name", local.config.cos_instance_name) + dns_custom_resolver_id = lookup(local.override[local.override_type], "dns_custom_resolver_id", local.config.dns_custom_resolver_id) + dns_instance_id = lookup(local.override[local.override_type], "dns_instance_id", local.config.dns_instance_id) + dns_domain_names = lookup(local.override[local.override_type], "dns_domain_names", local.config.dns_domain_names) + enable_atracker = lookup(local.override[local.override_type], "enable_atracker", local.config.enable_atracker) + # enable_bastion = lookup(local.override[local.override_type], "enable_bastion", local.config.enable_bastion) + bastion_instance = lookup(local.override[local.override_type], "bastion_instance", local.config.bastion_instance) + deployer_instance = lookup(local.override[local.override_type], "deployer_instance", local.config.deployer_instance) + enable_cos_integration = lookup(local.override[local.override_type], "enable_cos_integration", local.config.enable_cos_integration) + enable_vpc_flow_logs = lookup(local.override[local.override_type], "enable_vpc_flow_logs", local.config.enable_vpc_flow_logs) + hpcs_instance_name = lookup(local.override[local.override_type], "hpcs_instance_name", local.config.hpcs_instance_name) + key_management = lookup(local.override[local.override_type], "key_management", local.config.key_management) + client_instances = lookup(local.override[local.override_type], "client_instances", local.config.client_instances) + client_subnets_cidr = lookup(local.override[local.override_type], "client_subnets_cidr", local.config.client_subnets_cidr) + vpc_cidr = lookup(local.override[local.override_type], "vpc_cidr", local.config.vpc_cidr) + placement_strategy = lookup(local.override[local.override_type], "placement_strategy", local.config.placement_strategy) + cluster_prefix = lookup(local.override[local.override_type], "cluster_prefix", local.config.cluster_prefix) + protocol_instances = lookup(local.override[local.override_type], 
"protocol_instances", local.config.protocol_instances) + protocol_subnets_cidr = lookup(local.override[local.override_type], "protocol_subnets_cidr", local.config.protocol_subnets_cidr) + compute_instances = lookup(local.override[local.override_type], "compute_instances", local.config.compute_instances) + storage_gui_password = lookup(local.override[local.override_type], "storage_gui_password", local.config.storage_gui_password) + storage_gui_username = lookup(local.override[local.override_type], "storage_gui_username", local.config.storage_gui_username) + storage_instances = lookup(local.override[local.override_type], "storage_instances", local.config.storage_instances) + storage_servers = lookup(local.override[local.override_type], "storage_servers", local.config.storage_servers) + storage_subnets_cidr = lookup(local.override[local.override_type], "storage_subnets_cidr", local.config.storage_subnets_cidr) + vpc_name = lookup(local.override[local.override_type], "vpc_name", local.config.vpc_name) + observability_atracker_enable = lookup(local.override[local.override_type], "observability_atracker_enable", local.config.observability_atracker_enable) + observability_atracker_target_type = lookup(local.override[local.override_type], "observability_atracker_target_type", local.config.observability_atracker_target_type) + observability_monitoring_enable = lookup(local.override[local.override_type], "observability_monitoring_enable", local.config.observability_monitoring_enable) + observability_logs_enable_for_management = lookup(local.override[local.override_type], "observability_logs_enable_for_management", local.config.observability_logs_enable_for_management) + observability_logs_enable_for_compute = lookup(local.override[local.override_type], "observability_logs_enable_for_compute", local.config.observability_logs_enable_for_compute) + observability_enable_platform_logs = lookup(local.override[local.override_type], "observability_enable_platform_logs", local.config.observability_enable_platform_logs) + observability_enable_metrics_routing = lookup(local.override[local.override_type], "observability_enable_metrics_routing", local.config.observability_enable_metrics_routing) + observability_logs_retention_period = lookup(local.override[local.override_type], "observability_logs_retention_period", local.config.observability_logs_retention_period) + observability_monitoring_on_compute_nodes_enable = lookup(local.override[local.override_type], "observability_monitoring_on_compute_nodes_enable", local.config.observability_monitoring_on_compute_nodes_enable) + observability_monitoring_plan = lookup(local.override[local.override_type], "observability_monitoring_plan", local.config.observability_monitoring_plan) + skip_flowlogs_s2s_auth_policy = lookup(local.override[local.override_type], "skip_flowlogs_s2s_auth_policy", local.config.skip_flowlogs_s2s_auth_policy) + skip_kms_s2s_auth_policy = lookup(local.override[local.override_type], "skip_kms_s2s_auth_policy", local.config.skip_kms_s2s_auth_policy) + skip_iam_block_storage_authorization_policy = lookup(local.override[local.override_type], "skip_iam_block_storage_authorization_policy", local.config.skip_iam_block_storage_authorization_policy) + ibmcloud_api_key = lookup(local.override[local.override_type], "ibmcloud_api_key", local.config.ibmcloud_api_key) + afm_instances = lookup(local.override[local.override_type], "afm_instances", local.config.afm_instances) + afm_cos_config = lookup(local.override[local.override_type], "afm_cos_config", 
local.config.afm_cos_config) + enable_ldap = lookup(local.override[local.override_type], "enable_ldap", local.config.enable_ldap) + ldap_basedns = lookup(local.override[local.override_type], "ldap_basedns", local.config.ldap_basedns) + ldap_admin_password = lookup(local.override[local.override_type], "ldap_admin_password", local.config.ldap_admin_password) + ldap_user_name = lookup(local.override[local.override_type], "ldap_user_name", local.config.ldap_user_name) + ldap_user_password = lookup(local.override[local.override_type], "ldap_user_password", local.config.ldap_user_password) + ldap_server = lookup(local.override[local.override_type], "ldap_server", local.config.ldap_server) + ldap_server_cert = lookup(local.override[local.override_type], "ldap_server_cert", local.config.ldap_server_cert) + ldap_instance = lookup(local.override[local.override_type], "ldap_instance", local.config.ldap_instance) + scale_encryption_enabled = lookup(local.override[local.override_type], "scale_encryption_enabled", local.config.scale_encryption_enabled) + scale_encryption_type = lookup(local.override[local.override_type], "scale_encryption_type", local.config.scale_encryption_type) + gklm_instance_key_pair = lookup(local.override[local.override_type], "gklm_instance_key_pair", local.config.gklm_instance_key_pair) + gklm_instances = lookup(local.override[local.override_type], "gklm_instances", local.config.gklm_instances) + storage_type = lookup(local.override[local.override_type], "storage_type", local.config.storage_type) + colocate_protocol_instances = lookup(local.override[local.override_type], "colocate_protocol_instances", local.config.colocate_protocol_instances) + scale_encryption_admin_default_password = lookup(local.override[local.override_type], "scale_encryption_admin_default_password", local.config.scale_encryption_admin_default_password) + scale_encryption_admin_password = lookup(local.override[local.override_type], "scale_encryption_admin_password", local.config.scale_encryption_admin_password) + scale_encryption_admin_username = lookup(local.override[local.override_type], "scale_encryption_admin_username", local.config.scale_encryption_admin_username) + filesystem_config = lookup(local.override[local.override_type], "filesystem_config", local.config.filesystem_config) + existing_bastion_instance_name = lookup(local.override[local.override_type], "existing_bastion_instance_name", local.config.existing_bastion_instance_name) + existing_bastion_instance_public_ip = lookup(local.override[local.override_type], "existing_bastion_instance_public_ip", local.config.existing_bastion_instance_public_ip) + existing_bastion_security_group_id = lookup(local.override[local.override_type], "existing_bastion_security_group_id", local.config.existing_bastion_security_group_id) + existing_bastion_ssh_private_key = lookup(local.override[local.override_type], "existing_bastion_ssh_private_key", local.config.existing_bastion_ssh_private_key) + } +} diff --git a/solutions/scale/main.tf b/solutions/scale/main.tf new file mode 100644 index 00000000..5c86c9ed --- /dev/null +++ b/solutions/scale/main.tf @@ -0,0 +1,77 @@ +module "scale" { + source = "./../.." 
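  # The inputs below come from local.env (locals.tf), which starts from the values
  # in variables.tf and applies any overrides supplied through override.json or
  # var.override_json_string before passing them to the root module.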
+ scheduler = "Scale" + ibm_customer_number = var.ibm_customer_number + zones = var.zones + remote_allowed_ips = var.remote_allowed_ips + cluster_prefix = local.env.cluster_prefix + ssh_keys = local.env.ssh_keys + existing_resource_group = local.env.existing_resource_group + vpc_cluster_login_private_subnets_cidr_blocks = local.env.vpc_cluster_login_private_subnets_cidr_blocks + vpc_cluster_private_subnets_cidr_blocks = local.env.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = local.env.cos_instance_name + dns_custom_resolver_id = local.env.dns_custom_resolver_id + dns_instance_id = local.env.dns_instance_id + dns_domain_names = local.env.dns_domain_names + enable_atracker = local.env.enable_atracker + # enable_bastion = local.env.enable_bastion + bastion_instance = local.env.bastion_instance + deployer_instance = local.env.deployer_instance + enable_cos_integration = local.env.enable_cos_integration + enable_vpc_flow_logs = local.env.enable_vpc_flow_logs + key_management = local.env.key_management + client_instances = local.env.client_instances + vpc_cidr = local.env.vpc_cidr + placement_strategy = local.env.placement_strategy + protocol_instances = local.env.protocol_instances + protocol_subnets_cidr = [local.env.protocol_subnets_cidr] + colocate_protocol_instances = local.env.colocate_protocol_instances + static_compute_instances = local.env.compute_instances + storage_instances = local.env.storage_instances + storage_servers = local.env.storage_servers + storage_subnets_cidr = [local.env.storage_subnets_cidr] + vpc_name = local.env.vpc_name + compute_gui_password = local.env.compute_gui_password + compute_gui_username = local.env.compute_gui_username + storage_gui_password = local.env.storage_gui_password + storage_gui_username = local.env.storage_gui_username + observability_atracker_enable = local.env.observability_atracker_enable + observability_atracker_target_type = local.env.observability_atracker_target_type + observability_monitoring_enable = local.env.observability_monitoring_enable + observability_logs_enable_for_management = local.env.observability_logs_enable_for_management + observability_logs_enable_for_compute = local.env.observability_logs_enable_for_compute + observability_enable_platform_logs = local.env.observability_enable_platform_logs + observability_enable_metrics_routing = local.env.observability_enable_metrics_routing + observability_logs_retention_period = local.env.observability_logs_retention_period + observability_monitoring_on_compute_nodes_enable = local.env.observability_monitoring_on_compute_nodes_enable + observability_monitoring_plan = local.env.observability_monitoring_plan + skip_flowlogs_s2s_auth_policy = local.env.skip_flowlogs_s2s_auth_policy + skip_kms_s2s_auth_policy = local.env.skip_kms_s2s_auth_policy + skip_iam_block_storage_authorization_policy = local.env.skip_iam_block_storage_authorization_policy + ibmcloud_api_key = local.env.ibmcloud_api_key + afm_instances = local.env.afm_instances + afm_cos_config = local.env.afm_cos_config + enable_ldap = local.env.enable_ldap + ldap_basedns = local.env.ldap_basedns + ldap_admin_password = local.env.ldap_admin_password + ldap_user_name = local.env.ldap_user_name + ldap_user_password = local.env.ldap_user_password + ldap_server = local.env.ldap_server + ldap_server_cert = local.env.ldap_server_cert + ldap_instance = local.env.ldap_instance + scale_encryption_enabled = local.env.scale_encryption_enabled + scale_encryption_type = local.env.scale_encryption_type + gklm_instance_key_pair = 
local.env.gklm_instance_key_pair + gklm_instances = local.env.gklm_instances + storage_type = local.env.storage_type + scale_encryption_admin_password = local.env.scale_encryption_admin_password + filesystem_config = local.env.filesystem_config + existing_bastion_instance_name = local.env.existing_bastion_instance_name + existing_bastion_instance_public_ip = local.env.existing_bastion_instance_public_ip + existing_bastion_security_group_id = local.env.existing_bastion_security_group_id + existing_bastion_ssh_private_key = local.env.existing_bastion_ssh_private_key + client_subnets_cidr = [local.env.client_subnets_cidr] + # hpcs_instance_name = local.env.hpcs_instance_name + # scale_encryption_admin_username = local.env.scale_encryption_admin_username + # scale_encryption_admin_default_password = local.env.scale_encryption_admin_default_password +} diff --git a/solutions/scale/outputs.tf b/solutions/scale/outputs.tf new file mode 100644 index 00000000..96a6c69f --- /dev/null +++ b/solutions/scale/outputs.tf @@ -0,0 +1,4 @@ +# output "scale" { +# description = "Scale details" +# value = module.scale +# } diff --git a/solutions/scale/override.json b/solutions/scale/override.json new file mode 100644 index 00000000..894e99b2 --- /dev/null +++ b/solutions/scale/override.json @@ -0,0 +1,122 @@ +{ + "cluster_prefix": "scale", + "existing_resource_group": "Default", + "vpc_name": null, + "vpc_cidr": "10.0.0.0/8", + "placement_strategy": null, + "ssh_keys": null, + "enable_bastion": true, + "enable_deployer": true, + "deployer_instance_profile": "mx2-4x32", + "vpc_cluster_login_private_subnets_cidr_blocks": "10.0.0.0/24", + "client_subnets_cidr": "10.10.10.0/24", + "client_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "vpc_cluster_private_subnets_cidr_blocks": "10.10.20.0/24", + "compute_instances": [ + { + "profile": "cx2-2x4", + "count": 3, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "compute_gui_username": "admin", + "storage_subnets_cidr": "10.10.30.0/24", + "storage_instances": [ + { + "profile": "cx2-2x4", + "count": 10, + "image": "ibm-redhat-8-10-minimal-amd64-2", + "filesystem": "fs1" + }, + { + "profile": "cx2-2x4", + "count": 10, + "image": "ibm-redhat-8-10-minimal-amd64-2", + "filesystem": "fs2" + } + ], + "storage_servers": [ + { + "profile": "cx2d-metal-96x192", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "protocol_subnets_cidr": "10.10.40.0/24", + "protocol_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "colocate_protocol_instances": true, + "storage_gui_username": "admin", + "filesystem_config": [ + { + "filesystem": "fs1", + "block_size": "4M", + "default_data_replica": 2, + "default_metadata_replica": 2, + "max_data_replica": 3, + "max_metadata_replica": 3, + "mount_point": "/ibm/fs1" + } + ], + "filesets_config": [ + { + "fileset": "fileset1", + "filesystem": "fs1", + "junction_path": "/ibm/fs1/fileset1", + "client_mount_path": "/mnt", + "quota": 100 + }, + { + "fileset": "fileset2", + "filesystem": "fs1", + "junction_path": "/ibm/fs1/fileset1", + "client_mount_path": "/mnt", + "quota": 0 + } + ], + "afm_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "afm_cos_config": [ + { + "afm_fileset": "afm_fileset", + "mode": "iw", + "cos_instance": null, + "bucket_name": null, + "bucket_region": "us-south", + "cos_service_cred_key": "", + 
"bucket_storage_class": "smart", + "bucket_type": "region_location" + } + ], + "nsd_details": null, + "dns_instance_id": null, + "dns_custom_resolver_id": null, + "dns_domain_names": { + "compute": "comp.com", + "storage": "strg.com", + "protocol": "ces.com" + }, + "enable_cos_integration": true, + "cos_instance_name": null, + "enable_atracker": true, + "enable_vpc_flow_logs": true, + "key_management": "key_protect", + "hpcs_instance_name": null, + "clusters": null +} diff --git a/solutions/scale/variables.tf b/solutions/scale/variables.tf new file mode 100644 index 00000000..dfa93ff8 --- /dev/null +++ b/solutions/scale/variables.tf @@ -0,0 +1,761 @@ +############################################################################## +# Offering Variations +############################################################################## +variable "ibm_customer_number" { + type = string + sensitive = true + default = null + description = "Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn)." + validation { + condition = ( + var.ibm_customer_number == null || + can(regex("^[0-9A-Za-z]+(,[0-9A-Za-z]+)*$", var.ibm_customer_number)) + ) + error_message = "The IBM customer number input value cannot have special characters." + } +} + +############################################################################## +# Account Variables +############################################################################## +variable "ibmcloud_api_key" { + type = string + sensitive = true + description = "IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required." +} + +############################################################################## +# Cluster Level Variables +############################################################################## +variable "zones" { + description = "Specify the IBM Cloud zone within the chosen region where the IBM Spectrum LSF cluster will be deployed. A single zone input is required, and the management nodes, file storage shares, and compute nodes will all be provisioned in this zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli)." + type = list(string) + default = ["us-east-1"] + validation { + condition = length(var.zones) == 1 + error_message = "HPC product deployment supports only a single zone. Provide a value for a single zone from the supported regions: eu-de-2 or eu-de-3 for eu-de, us-east-1 or us-east-3 for us-east, and us-south-1 for us-south." + } +} + +variable "ssh_keys" { + type = list(string) + default = null + description = "The key pair to use to access the HPC cluster." +} + +variable "remote_allowed_ips" { + type = list(string) + description = "Comma-separated list of IP addresses that can access the IBM Spectrum LSF cluster instance through an SSH interface. For security purposes, provide the public IP addresses assigned to the devices that are authorized to establish SSH connections (for example, [\"169.45.117.34\"]). To fetch the IP address of the device, use [https://ipv4.icanhazip.com/](https://ipv4.icanhazip.com/)." 
+ validation { + condition = alltrue([ + for o in var.remote_allowed_ips : !contains(["0.0.0.0/0", "0.0.0.0"], o) + ]) + error_message = "For security, provide the public IP addresses assigned to the devices authorized to establish SSH connections. Use https://ipv4.icanhazip.com/ to fetch the ip address of the device." + } + validation { + condition = alltrue([ + for a in var.remote_allowed_ips : can(regex("^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(/(3[0-2]|2[0-9]|1[0-9]|[0-9]))?$", a)) + ]) + error_message = "The provided IP address format is not valid. Check if the IP address contains a comma instead of a dot, and ensure there are double quotation marks between each IP address range if using multiple IP ranges. For multiple IP address, use the format [\"169.45.117.34\",\"128.122.144.145\"]." + } +} + +variable "cluster_prefix" { + type = string + default = "scale" + description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." + validation { + error_message = "Prefix must begin and end with a letter and contain only letters, numbers, and - characters." + condition = can(regex("^([A-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.cluster_prefix)) + } + validation { + condition = length(var.cluster_prefix) <= 16 + error_message = "The cluster_prefix must be 16 characters or fewer." + } +} + +############################################################################## +# Resource Groups Variables +############################################################################## +variable "existing_resource_group" { + type = string + default = "Default" + description = "String describing resource groups to create or reference" + +} + +############################################################################## +# VPC Variables +############################################################################## +variable "vpc_name" { + type = string + default = null + description = "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" +} + +variable "vpc_cidr" { + type = string + default = "10.241.0.0/18" + description = "Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning." +} + +variable "placement_strategy" { + type = string + default = null + description = "VPC placement groups to create (null / host_spread / power_spread)" +} + +############################################################################## +# Access Variables +############################################################################## +# variable "enable_bastion" { +# type = bool +# default = true +# description = "The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false." +# } + +variable "bastion_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "ibm-ubuntu-22-04-5-minimal-amd64-3" + profile = "cx2-4x8" + } + description = "Configuration for the Bastion node, including the image and instance profile. Only Ubuntu stock images are supported." 
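  # Editor's sketch: overriding this object keeps the same shape; the image below is
  # the documented default, while the profile is a hypothetical larger size:
  #   bastion_instance = {
  #     image   = "ibm-ubuntu-22-04-5-minimal-amd64-3"
  #     profile = "cx2-8x16"
  #   }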
+} + +variable "vpc_cluster_login_private_subnets_cidr_blocks" { + type = string + default = "10.241.16.0/28" + description = "Provide the CIDR block required for the creation of the login cluster's private subnet. Only one CIDR block is needed. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Since the login subnet is used only for the creation of login virtual server instances, provide a CIDR range of /28." + validation { + condition = tonumber(regex("^.*?/(\\d+)$", var.vpc_cluster_login_private_subnets_cidr_blocks)[0]) <= 28 + error_message = "This subnet is used to create only a login virtual server instance. Providing a larger CIDR size will waste the usage of available IPs. A CIDR range of /28 is sufficient for the creation of the login subnet." + } +} + +############################################################################## +# Deployer Variables +############################################################################## + +variable "deployer_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "jay-lsf-new-image" + profile = "mx2-4x32" + } + description = "Configuration for the deployer node, including the custom image and instance profile. By default, uses fixpack_15 image and a bx2-8x32 profile." +} + +############################################################################## +# Compute Variables +############################################################################## +variable "client_subnets_cidr" { + type = string + default = "10.241.50.0/24" + description = "Subnet CIDR block to launch the client host." +} + +variable "client_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Number of instances to be launched for client." +} + +variable "vpc_cluster_private_subnets_cidr_blocks" { + type = string + default = "10.241.0.0/20" + description = "Provide the CIDR block required for the creation of the compute cluster's private subnet. One CIDR block is required. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Ensure the selected CIDR block size can accommodate the maximum number of management and dynamic compute nodes expected in your cluster. For more information on CIDR block size selection, refer to the documentation, see [Choosing IP ranges for your VPC](https://cloud.ibm.com/docs/vpc?topic=vpc-choosing-ip-ranges-for-your-vpc)." +} + +variable "compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + filesystem = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 3 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem = "/ibm/fs1" + }] + description = "Total Number of instances to be launched for compute cluster." +} + +variable "compute_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on compute cluster." 
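  # Editor's note: a worked example of the login-subnet check defined earlier in this
  # file: for "10.241.16.0/28" the regex captures "28" and 28 <= 28 holds, so the value
  # is accepted; "10.241.16.0/29" captures "29" and is rejected, since the condition
  # only admits prefix lengths of /28 or lower.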
+} + +variable "compute_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for compute cluster GUI" +} + +############################################################################## +# Storage Scale Variables +############################################################################## +variable "storage_subnets_cidr" { + type = string + default = "10.241.30.0/24" + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "storage_instances" { + type = list( + object({ + profile = string + count = number + image = string + filesystem = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem = "/ibm/fs1" + }] + description = "Number of instances to be launched for storage cluster." +} + +variable "storage_servers" { + type = list( + object({ + profile = string + count = number + image = string + filesystem = string + }) + ) + default = [{ + profile = "cx2d-metal-96x192" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem = "/gpfs/fs1" + }] + description = "Number of BareMetal Servers to be launched for storage cluster." +} + +variable "protocol_subnets_cidr" { + type = string + default = "10.241.40.0/24" + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "protocol_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Number of instances to be launched for protocol hosts." +} + +variable "colocate_protocol_instances" { + type = bool + default = true + description = "Enable it to use storage instances as protocol instances" +} + +variable "storage_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on storage cluster." +} + +variable "storage_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for storage cluster GUI" +} + +variable "filesystem_config" { + type = list(object({ + filesystem = string + block_size = string + default_data_replica = number + default_metadata_replica = number + max_data_replica = number + max_metadata_replica = number + mount_point = string + })) + default = [{ + filesystem = "fs1" + block_size = "4M" + default_data_replica = 2 + default_metadata_replica = 2 + max_data_replica = 3 + max_metadata_replica = 3 + mount_point = "/ibm/fs1" + }] + description = "File system configurations." +} + +# variable "filesets_config" { +# type = list(object({ +# fileset = string +# filesystem = string +# junction_path = string +# client_mount_path = string +# quota = number +# })) +# default = [{ +# fileset = "fileset1" +# filesystem = "fs1" +# junction_path = "/ibm/fs1/fileset1" +# client_mount_path = "/mnt" +# quota = 100 +# }] +# description = "Fileset configurations." +# } + +variable "afm_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Number of instances to be launched for afm hosts." 
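# Example: AFM gateways are disabled by default (count = 0). A sketch of
# enabling a single gateway; the matching COS bucket details go into
# afm_cos_config, defined below. Values are illustrative only.
#
# afm_instances = [{
#   profile = "bx2-2x8"
#   count   = 1
#   image   = "ibm-redhat-8-10-minimal-amd64-4"
# }]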
+} + +variable "afm_cos_config" { + type = list(object({ + afm_fileset = string, + mode = string, + cos_instance = string, + bucket_name = string, + bucket_region = string, + cos_service_cred_key = string, + bucket_type = string, + bucket_storage_class = string + })) + default = [{ + afm_fileset = "afm_fileset" + mode = "iw" + cos_instance = "" + bucket_name = "" + bucket_region = "us-south" + cos_service_cred_key = "" + bucket_storage_class = "smart" + bucket_type = "region_location" + }] + # default = [{ + # afm_fileset = "afm_fileset" + # mode = "iw" + # cos_instance = null + # bucket_name = null + # bucket_region = "us-south" + # cos_service_cred_key = "" + # bucket_storage_class = "smart" + # bucket_type = "region_location" + # }] + description = "AFM configurations." +} + +############################################################################## +# DNS Variables +############################################################################## + +variable "dns_instance_id" { + type = string + default = null + description = "IBM Cloud HPC DNS service instance id." +} + +variable "dns_custom_resolver_id" { + type = string + default = null + description = "IBM Cloud DNS custom resolver id." +} + +variable "dns_domain_names" { + type = object({ + compute = string + storage = string + protocol = string + client = string + gklm = string + }) + default = { + compute = "comp.com" + storage = "strg.com" + protocol = "ces.com" + client = "clnt.com" + gklm = "gklm.com" + } + description = "IBM Cloud HPC DNS domain names." +} + +############################################################################## +# Encryption Variables +############################################################################## +variable "key_management" { + type = string + default = "key_protect" + description = "Set the value as key_protect to enable customer managed encryption for boot volume and file share. If the key_management is set as null, IBM Cloud resources will be always be encrypted through provider managed." + validation { + condition = var.key_management == "null" || var.key_management == null || var.key_management == "key_protect" + error_message = "key_management must be either 'null' or 'key_protect'." + } +} + +variable "hpcs_instance_name" { + type = string + default = null + description = "Hyper Protect Crypto Service instance" +} + +############################################################################## +# Observability Variables +############################################################################## +variable "enable_cos_integration" { + type = bool + default = true + description = "Integrate COS with HPC solution" +} + +variable "cos_instance_name" { + type = string + default = null + description = "Exiting COS instance name" +} + +variable "enable_atracker" { + type = bool + default = true + description = "Enable Activity tracker" +} + +variable "enable_vpc_flow_logs" { + type = bool + default = true + description = "Enable Activity tracker" +} + +############################################################################## +# Override JSON +############################################################################## +variable "override" { + type = bool + default = false + description = "Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment." + +} + +variable "override_json_string" { + type = string + default = null + description = "Override default values with a JSON object. 
Any JSON other than an empty string overrides other configuration changes." +} + +############################################################################# +# LDAP variables +############################################################################## +variable "enable_ldap" { + type = bool + default = false + description = "Set this option to true to enable LDAP for IBM Cloud HPC, with the default value set to false." +} + +variable "ldap_basedns" { + type = string + default = "ldapscale.com" + description = "The dns domain name is used for configuring the LDAP server. If an LDAP server is already in existence, ensure to provide the associated DNS domain name." +} + +variable "ldap_server" { + type = string + default = null + description = "Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created." +} + +variable "ldap_server_cert" { + type = string + sensitive = true + default = null + description = "Provide the existing LDAP server certificate. This value is required if the 'ldap_server' variable is not set to null. If the certificate is not provided or is invalid, the LDAP configuration may fail." +} + +variable "ldap_admin_password" { + type = string + sensitive = true + default = null + description = "The LDAP administrative password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@_+:) are required. It is important to avoid including the username in the password for enhanced security." +} + +variable "ldap_user_name" { + type = string + default = "" + description = "Custom LDAP User for performing cluster operations. Note: Username should be between 4 to 32 characters, (any combination of lowercase and uppercase letters).[This value is ignored for an existing LDAP server]" +} + +variable "ldap_user_password" { + type = string + sensitive = true + default = "" + description = "The LDAP user password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@_+:) are required.It is important to avoid including the username in the password for enhanced security.[This value is ignored for an existing LDAP server]." +} + +# variable "ldap_instance_key_pair" { +# type = list(string) +# default = null +# description = "Name of the SSH key configured in your IBM Cloud account that is used to establish a connection to the LDAP Server. Make sure that the SSH key is present in the same resource group and region where the LDAP Servers are provisioned. If you do not have an SSH key in your IBM Cloud account, create one by using the [SSH keys](https://cloud.ibm.com/docs/vpc?topic=vpc-ssh-keys) instructions." +# } + +variable "ldap_instance" { + type = list( + object({ + profile = string + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + image = "ibm-ubuntu-22-04-5-minimal-amd64-1" + }] + description = "Profile and Image name to be used for provisioning the LDAP instances. 
Note: Debian based OS are only supported for the LDAP feature" +} + +############################################################################## +# GKLM variables +############################################################################## +variable "scale_encryption_enabled" { + type = bool + default = false + description = "To enable the encryption for the filesystem. Select true or false" +} + +variable "scale_encryption_type" { + type = string + default = null + description = "To enable filesystem encryption, specify either 'key_protect' or 'gklm'. If neither is specified, the default value will be 'null' and encryption is disabled" +} + +variable "gklm_instance_key_pair" { + type = list(string) + default = null + description = "The key pair to use to launch the GKLM host." +} + +variable "gklm_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Number of instances to be launched for client." +} + +variable "scale_encryption_admin_default_password" { + type = string + default = null + description = "The default administrator password used for resetting the admin password based on the user input. The password has to be updated which was configured during the GKLM installation." +} + +variable "scale_encryption_admin_username" { + type = string + default = "SKLMAdmin" + description = "The default Admin username for Security Key Lifecycle Manager(GKLM)." +} + +variable "scale_encryption_admin_password" { + type = string + default = null + description = "Password that is used for performing administrative operations for the GKLM.The password must contain at least 8 characters and at most 20 characters. For a strong password, at least three alphabetic characters are required, with at least one uppercase and one lowercase letter. Two numbers, and at least one special character from this(~@_+:). Make sure that the password doesn't include the username. Visit this [page](https://www.ibm.com/docs/en/gklm/3.0.1?topic=roles-password-policy) to know more about password policy of GKLM. " +} + +variable "storage_type" { + type = string + default = "scratch" + description = "Select the required storage type(scratch/persistent/evaluation)." +} + +# variable "custom_file_shares" { +# type = list( +# object({ +# mount_path = string, +# size = number, +# iops = number +# }) +# ) +# default = [{ +# mount_path = "/mnt/binaries" +# size = 100 +# iops = 1000 +# }, { +# mount_path = "/mnt/data" +# size = 100 +# iops = 1000 +# }] +# description = "Custom file shares to access shared storage" +# } + +############################################################################## +# Observability Variables +############################################################################## + +variable "observability_atracker_enable" { + type = bool + default = true + description = "Activity Tracker Event Routing to configure how to route auditing events. While multiple Activity Tracker instances can be created, only one tracker is needed to capture all events. Creating additional trackers is unnecessary if an existing Activity Tracker is already integrated with a COS bucket. In such cases, set the value to false, as all events can be monitored and accessed through the existing Activity Tracker." 
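# Example: a sketch of the inputs that turn on GKLM-based filesystem
# encryption, using the scale_encryption_* and gklm_* variables declared
# earlier in this file. The key name and counts are illustrative only.
#
# scale_encryption_enabled = true
# scale_encryption_type    = "gklm"
# gklm_instance_key_pair   = ["my-ssh-key"]   # hypothetical key name
# gklm_instances = [{
#   profile = "bx2-2x8"
#   count   = 2
#   image   = "ibm-redhat-8-10-minimal-amd64-4"
# }]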
+} + +variable "observability_atracker_target_type" { + type = string + default = "cloudlogs" + description = "All the events will be stored in either COS bucket or Cloud Logs on the basis of user input, so customers can retrieve or ingest them in their system." + validation { + condition = contains(["cloudlogs", "cos"], var.observability_atracker_target_type) + error_message = "Allowed values for atracker target type is cloudlogs and cos." + } +} + +variable "observability_monitoring_enable" { + description = "Set false to disable IBM Cloud Monitoring integration. If enabled, infrastructure and LSF application metrics from Management Nodes will be ingested." + type = bool + default = true +} + +variable "observability_logs_enable_for_management" { + description = "Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Management Nodes will be ingested." + type = bool + default = false +} + +variable "observability_logs_enable_for_compute" { + description = "Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Compute Nodes will be ingested." + type = bool + default = false +} + +variable "observability_enable_platform_logs" { + description = "Setting this to true will create a tenant in the same region that the Cloud Logs instance is provisioned to enable platform logs for that region. NOTE: You can only have 1 tenant per region in an account." + type = bool + default = false +} + +variable "observability_enable_metrics_routing" { + description = "Enable metrics routing to manage metrics at the account-level by configuring targets and routes that define where data points are routed." + type = bool + default = false +} + +variable "observability_logs_retention_period" { + description = "The number of days IBM Cloud Logs will retain the logs data in Priority insights. Allowed values: 7, 14, 30, 60, 90." + type = number + default = 7 + validation { + condition = contains([7, 14, 30, 60, 90], var.observability_logs_retention_period) + error_message = "Allowed values for cloud logs retention period is 7, 14, 30, 60, 90." + } +} + +variable "observability_monitoring_on_compute_nodes_enable" { + description = "Set false to disable IBM Cloud Monitoring integration. If enabled, infrastructure metrics from Compute Nodes will be ingested." + type = bool + default = false +} + +variable "observability_monitoring_plan" { + description = "Type of service plan for IBM Cloud Monitoring instance. You can choose one of the following: lite, graduated-tier. For all details visit [IBM Cloud Monitoring Service Plans](https://cloud.ibm.com/docs/monitoring?topic=monitoring-service_plans)." + type = string + default = "graduated-tier" + validation { + condition = can(regex("lite|graduated-tier", var.observability_monitoring_plan)) + error_message = "Please enter a valid plan for IBM Cloud Monitoring, for all details visit https://cloud.ibm.com/docs/monitoring?topic=monitoring-service_plans." + } +} + +variable "skip_flowlogs_s2s_auth_policy" { + type = bool + default = false + description = "Skip auth policy between flow logs service and COS instance, set to true if this policy is already in place on account." +} + +variable "skip_kms_s2s_auth_policy" { + type = bool + default = false + description = "Skip auth policy between KMS service and COS instance, set to true if this policy is already in place on account." 
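# Example: a sketch combining the observability_* inputs above, routing
# Activity Tracker events to Cloud Logs and keeping Priority insights data
# for 30 days. All values are illustrative only.
#
# observability_atracker_enable            = true
# observability_atracker_target_type       = "cloudlogs"
# observability_logs_enable_for_management = true
# observability_logs_retention_period      = 30
# observability_monitoring_enable          = true
# observability_monitoring_plan            = "graduated-tier"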
+} + +variable "skip_iam_block_storage_authorization_policy" { + type = bool + default = false + description = "When using an existing KMS instance name, set this value to true if authorization is already enabled between KMS instance and the block storage volume. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment.For more information on how to create authorization policy manually, see [creating authorization policies for block storage volume](https://cloud.ibm.com/docs/vpc?topic=vpc-block-s2s-auth&interface=ui)." +} + +########################################################################### +# Existing Bastion Support variables +########################################################################### + +variable "existing_bastion_instance_name" { + type = string + default = null + description = "Provide the name of the bastion instance. If none given then new bastion will be created." +} + +variable "existing_bastion_instance_public_ip" { + type = string + default = null + description = "Provide the public ip address of the bastion instance to establish the remote connection." +} + +variable "existing_bastion_security_group_id" { + type = string + default = null + description = "Specify the security group ID for the bastion server. This ID will be added as an allowlist rule on the HPC cluster nodes to facilitate secure SSH connections through the bastion node. By restricting access through a bastion server, this setup enhances security by controlling and monitoring entry points into the cluster environment. Ensure that the specified security group is correctly configured to permit only authorized traffic for secure and efficient management of cluster resources." +} + +variable "existing_bastion_ssh_private_key" { + type = string + sensitive = true + default = null + description = "Provide the private SSH key (named id_rsa) used during the creation and configuration of the bastion server to securely authenticate and connect to the bastion server. This allows access to internal network resources from a secure entry point. Note: The corresponding public SSH key (named id_rsa.pub) must already be available in the ~/.ssh/authorized_keys file on the bastion host to establish authentication." +} diff --git a/solutions/scale/version.tf b/solutions/scale/version.tf new file mode 100644 index 00000000..93f82bed --- /dev/null +++ b/solutions/scale/version.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.9.0" + required_providers { + ibm = { + source = "IBM-Cloud/ibm" + version = ">= 1.68.1, < 2.0.0" + } + } +} + +provider "ibm" { + ibmcloud_api_key = var.ibmcloud_api_key + region = local.region +} diff --git a/solutions/slurm/README.md b/solutions/slurm/README.md new file mode 100644 index 00000000..a299f162 --- /dev/null +++ b/solutions/slurm/README.md @@ -0,0 +1,79 @@ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [ibm](#requirement\_ibm) | >= 1.68.1, < 2.0.0 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [slurm](#module\_slurm) | ./../.. | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [allowed\_cidr](#input\_allowed\_cidr) | Network CIDR to access the VPC. This is used to manage network ACL rules for accessing the cluster. 
| `list(string)` | n/a | yes | +| [bastion\_ssh\_keys](#input\_bastion\_ssh\_keys) | The key pair to use to access the bastion host. | `list(string)` | `null` | no | +| [bastion\_subnets\_cidr](#input\_bastion\_subnets\_cidr) | Subnet CIDR block to launch the bastion host. | `string` | `"10.0.0.0/24"` | no | +| [client\_instances](#input\_client\_instances) | Number of instances to be launched for client. |
<pre>list(<br>    object({<br>      profile = string<br>      count = number<br>      image = string<br>    })<br>  )</pre> | <pre>[<br>  {<br>    "count": 2,<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "cx2-2x4"<br>  }<br>]</pre>
| no | +| [client\_ssh\_keys](#input\_client\_ssh\_keys) | The key pair to use to launch the client host. | `list(string)` | `null` | no | +| [client\_subnets\_cidr](#input\_client\_subnets\_cidr) | Subnet CIDR block to launch the client host. | `string` | `"10.10.10.0/24"` | no | +| [compute\_gui\_password](#input\_compute\_gui\_password) | Password for compute cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [compute\_gui\_username](#input\_compute\_gui\_username) | GUI user to perform system management and monitoring tasks on compute cluster. | `string` | `"admin"` | no | +| [compute\_ssh\_keys](#input\_compute\_ssh\_keys) | The key pair to use to launch the compute host. | `list(string)` | `null` | no | +| [compute\_subnets\_cidr](#input\_compute\_subnets\_cidr) | Subnet CIDR block to launch the compute cluster host. | `string` | `"10.10.20.0/24"` | no | +| [cos\_instance\_name](#input\_cos\_instance\_name) | Exiting COS instance name | `string` | `null` | no | +| [deployer\_instance\_profile](#input\_deployer\_instance\_profile) | Deployer should be only used for better deployment performance | `string` | `"mx2-4x32"` | no | +| [dns\_custom\_resolver\_id](#input\_dns\_custom\_resolver\_id) | IBM Cloud DNS custom resolver id. | `string` | `null` | no | +| [dns\_domain\_names](#input\_dns\_domain\_names) | IBM Cloud HPC DNS domain names. |
<pre>object({<br>    compute = string<br>    storage = string<br>    protocol = string<br>  })</pre> | <pre>{<br>  "compute": "comp.com",<br>  "protocol": "ces.com",<br>  "storage": "strg.com"<br>}</pre>
| no | +| [dns\_instance\_id](#input\_dns\_instance\_id) | IBM Cloud HPC DNS service instance id. | `string` | `null` | no | +| [dynamic\_compute\_instances](#input\_dynamic\_compute\_instances) | MaxNumber of instances to be launched for compute cluster. |
<pre>list(<br>    object({<br>      profile = string<br>      count = number<br>      image = string<br>    })<br>  )</pre> | <pre>[<br>  {<br>    "count": 1024,<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "cx2-2x4"<br>  }<br>]</pre>
| no | +| [enable\_atracker](#input\_enable\_atracker) | Enable Activity tracker | `bool` | `true` | no | +| [enable\_bastion](#input\_enable\_bastion) | The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false. | `bool` | `true` | no | +| [enable\_cos\_integration](#input\_enable\_cos\_integration) | Integrate COS with HPC solution | `bool` | `true` | no | +| [enable\_deployer](#input\_enable\_deployer) | Deployer should be only used for better deployment performance | `bool` | `false` | no | +| [enable\_vpc\_flow\_logs](#input\_enable\_vpc\_flow\_logs) | Enable Activity tracker | `bool` | `true` | no | +| [enable\_vpn](#input\_enable\_vpn) | The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN, set this value to true. | `bool` | `false` | no | +| [existing\_resource\_group](#input\_existing\_resource\_group) | String describing resource groups to create or reference | `string` | `"Default"` | no | +| [file\_shares](#input\_file\_shares) | Custom file shares to access shared storage |
<pre>list(<br>    object({<br>      mount_path = string,<br>      size = number,<br>      iops = number<br>    })<br>  )</pre> | <pre>[<br>  {<br>    "iops": 1000,<br>    "mount_path": "/mnt/binaries",<br>    "size": 100<br>  },<br>  {<br>    "iops": 1000,<br>    "mount_path": "/mnt/data",<br>    "size": 100<br>  }<br>]</pre>
| no | +| [hpcs\_instance\_name](#input\_hpcs\_instance\_name) | Hyper Protect Crypto Service instance | `string` | `null` | no | +| [ibm\_customer\_number](#input\_ibm\_customer\_number) | Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn). | `string` | n/a | yes | +| [ibmcloud\_api\_key](#input\_ibmcloud\_api\_key) | IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required. | `string` | n/a | yes | +| [key\_management](#input\_key\_management) | Set the value as key\_protect to enable customer managed encryption for boot volume and file share. If the key\_management is set as null, IBM Cloud resources will be always be encrypted through provider managed. | `string` | `"key_protect"` | no | +| [management\_instances](#input\_management\_instances) | Number of instances to be launched for management. |
<pre>list(<br>    object({<br>      profile = string<br>      count = number<br>      image = string<br>    })<br>  )</pre> | <pre>[<br>  {<br>    "count": 2,<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "cx2-2x4"<br>  }<br>]</pre>
| no | +| [network\_cidr](#input\_network\_cidr) | Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning. | `string` | `"10.0.0.0/8"` | no | +| [override](#input\_override) | Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment. | `bool` | `false` | no | +| [override\_json\_string](#input\_override\_json\_string) | Override default values with a JSON object. Any JSON other than an empty string overrides other configuration changes. | `string` | `null` | no | +| [placement\_strategy](#input\_placement\_strategy) | VPC placement groups to create (null / host\_spread / power\_spread) | `string` | `null` | no | +| [cluster_prefix](#input\_prefix) | A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters. | `string` | `"lsf"` | no | +| [protocol\_instances](#input\_protocol\_instances) | Number of instances to be launched for protocol hosts. |
<pre>list(<br>    object({<br>      profile = string<br>      count = number<br>      image = string<br>    })<br>  )</pre> | <pre>[<br>  {<br>    "count": 2,<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "bx2-2x8"<br>  }<br>]</pre>
| no | +| [protocol\_subnets\_cidr](#input\_protocol\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `string` | `"10.10.40.0/24"` | no | +| [ssh\_keys](#input\_ssh\_keys) | The key pair to use to access the HPC cluster. | `list(string)` | `null` | no | +| [static\_compute\_instances](#input\_static\_compute\_instances) | Min Number of instances to be launched for compute cluster. |
<pre>list(<br>    object({<br>      profile = string<br>      count = number<br>      image = string<br>    })<br>  )</pre> | <pre>[<br>  {<br>    "count": 1,<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "cx2-2x4"<br>  }<br>]</pre>
| no | +| [storage\_gui\_password](#input\_storage\_gui\_password) | Password for storage cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [storage\_gui\_username](#input\_storage\_gui\_username) | GUI user to perform system management and monitoring tasks on storage cluster. | `string` | `"admin"` | no | +| [storage\_instances](#input\_storage\_instances) | Number of instances to be launched for storage cluster. |
<pre>list(<br>    object({<br>      profile = string<br>      count = number<br>      image = string<br>      filesystem_name = optional(string)<br>    })<br>  )</pre> | <pre>[<br>  {<br>    "count": 2,<br>    "filesystem_name": "fs1",<br>    "image": "ibm-redhat-8-10-minimal-amd64-2",<br>    "profile": "bx2-2x8"<br>  }<br>]</pre>
| no | +| [storage\_ssh\_keys](#input\_storage\_ssh\_keys) | The key pair to use to launch the storage cluster host. | `list(string)` | `null` | no | +| [storage\_subnets\_cidr](#input\_storage\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `string` | `"10.10.30.0/24"` | no | +| [vpc](#input\_vpc) | Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc) | `string` | `null` | no | +| [vpn\_peer\_address](#input\_vpn\_peer\_address) | The peer public IP address to which the VPN will be connected. | `string` | `null` | no | +| [vpn\_peer\_cidr](#input\_vpn\_peer\_cidr) | The peer CIDRs (e.g., 192.168.0.0/24) to which the VPN will be connected. | `list(string)` | `null` | no | +| [vpn\_preshared\_key](#input\_vpn\_preshared\_key) | The pre-shared key for the VPN. | `string` | `null` | no | +| [zone](#input\_zone) | Zone where VPC will be created. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [slurm](#output\_slurm) | Slurm details | diff --git a/solutions/slurm/catalogValidationValues.json.template b/solutions/slurm/catalogValidationValues.json.template new file mode 100644 index 00000000..bb5298d4 --- /dev/null +++ b/solutions/slurm/catalogValidationValues.json.template @@ -0,0 +1,7 @@ +{ + "ibmcloud_api_key": $VALIDATION_APIKEY, + "cluster_prefix": $PREFIX, + "zones": "[\"ca-tor-1\"]", + "existing_resource_group": "geretain-hpc-rg", + "ssh_keys": "[\"geretain-hpc-ssh-key\"]" +} diff --git a/solutions/slurm/locals.tf b/solutions/slurm/locals.tf new file mode 100644 index 00000000..0d529799 --- /dev/null +++ b/solutions/slurm/locals.tf @@ -0,0 +1,96 @@ +# locals needed for ibm provider +locals { + # Region and Zone calculations + region = join("-", slice(split("-", var.zones[0]), 0, 2)) +} + +locals { + override_json_path = abspath("./override.json") + override = { + override = jsondecode(var.override && var.override_json_string == null ? + (local.override_json_path == "" ? file("${path.root}/override.json") : file(local.override_json_path)) + : + "{}") + override_json_string = jsondecode(var.override_json_string == null ? "{}" : var.override_json_string) + } + override_type = var.override_json_string == null ? 
"override" : "override_json_string" +} + +locals { + config = { + existing_resource_group = var.existing_resource_group + remote_allowed_ips = var.remote_allowed_ips + deployer_instance = var.deployer_instance + ssh_keys = var.ssh_keys + vpc_cluster_login_private_subnets_cidr_blocks = var.vpc_cluster_login_private_subnets_cidr_blocks + compute_gui_password = var.compute_gui_password + compute_gui_username = var.compute_gui_username + vpc_cluster_private_subnets_cidr_blocks = var.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = var.cos_instance_name + dns_custom_resolver_id = var.dns_custom_resolver_id + dns_instance_id = var.dns_instance_id + dns_domain_names = var.dns_domain_names + dynamic_compute_instances = var.dynamic_compute_instances + enable_atracker = var.enable_atracker + # enable_bastion = var.enable_bastion + enable_cos_integration = var.enable_cos_integration + enable_vpc_flow_logs = var.enable_vpc_flow_logs + custom_file_shares = var.custom_file_shares + hpcs_instance_name = var.hpcs_instance_name + key_management = var.key_management + client_instances = var.client_instances + client_subnets_cidr = var.client_subnets_cidr + management_instances = var.management_instances + vpc_cidr = var.vpc_cidr + placement_strategy = var.placement_strategy + cluster_prefix = var.cluster_prefix + protocol_instances = var.protocol_instances + protocol_subnets_cidr = var.protocol_subnets_cidr + static_compute_instances = var.static_compute_instances + storage_gui_password = var.storage_gui_password + storage_gui_username = var.storage_gui_username + storage_instances = var.storage_instances + storage_subnets_cidr = var.storage_subnets_cidr + vpc_name = var.vpc_name + } +} + +# Compile Environment for Config output +locals { + env = { + existing_resource_group = lookup(local.override[local.override_type], "existing_resource_group", local.config.existing_resource_group) + remote_allowed_ips = lookup(local.override[local.override_type], "remote_allowed_ips", local.config.remote_allowed_ips) + deployer_instance = lookup(local.override[local.override_type], "deployer_instance", local.config.deployer_instance) + ssh_keys = lookup(local.override[local.override_type], "ssh_keys", local.config.ssh_keys) + vpc_cluster_login_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_login_private_subnets_cidr_blocks", local.config.vpc_cluster_login_private_subnets_cidr_blocks) + compute_gui_password = lookup(local.override[local.override_type], "compute_gui_password", local.config.compute_gui_password) + compute_gui_username = lookup(local.override[local.override_type], "compute_gui_username", local.config.compute_gui_username) + vpc_cluster_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_private_subnets_cidr_blocks", local.config.vpc_cluster_private_subnets_cidr_blocks) + cos_instance_name = lookup(local.override[local.override_type], "cos_instance_name", local.config.cos_instance_name) + dns_custom_resolver_id = lookup(local.override[local.override_type], "dns_custom_resolver_id", local.config.dns_custom_resolver_id) + dns_instance_id = lookup(local.override[local.override_type], "dns_instance_id", local.config.dns_instance_id) + dns_domain_names = lookup(local.override[local.override_type], "dns_domain_names", local.config.dns_domain_names) + dynamic_compute_instances = lookup(local.override[local.override_type], "dynamic_compute_instances", local.config.dynamic_compute_instances) + enable_atracker = 
lookup(local.override[local.override_type], "enable_atracker", local.config.enable_atracker) + # enable_bastion = lookup(local.override[local.override_type], "enable_bastion", local.config.enable_bastion) + enable_cos_integration = lookup(local.override[local.override_type], "enable_cos_integration", local.config.enable_cos_integration) + enable_vpc_flow_logs = lookup(local.override[local.override_type], "enable_vpc_flow_logs", local.config.enable_vpc_flow_logs) + custom_file_shares = lookup(local.override[local.override_type], "custom_file_shares", local.config.custom_file_shares) + hpcs_instance_name = lookup(local.override[local.override_type], "hpcs_instance_name", local.config.hpcs_instance_name) + key_management = lookup(local.override[local.override_type], "key_management", local.config.key_management) + client_instances = lookup(local.override[local.override_type], "client_instances", local.config.client_instances) + client_subnets_cidr = lookup(local.override[local.override_type], "client_subnets_cidr", local.config.client_subnets_cidr) + management_instances = lookup(local.override[local.override_type], "management_instances", local.config.management_instances) + vpc_cidr = lookup(local.override[local.override_type], "vpc_cidr", local.config.vpc_cidr) + placement_strategy = lookup(local.override[local.override_type], "placement_strategy", local.config.placement_strategy) + cluster_prefix = lookup(local.override[local.override_type], "cluster_prefix", local.config.cluster_prefix) + protocol_instances = lookup(local.override[local.override_type], "protocol_instances", local.config.protocol_instances) + protocol_subnets_cidr = lookup(local.override[local.override_type], "protocol_subnets_cidr", local.config.protocol_subnets_cidr) + static_compute_instances = lookup(local.override[local.override_type], "static_compute_instances", local.config.static_compute_instances) + storage_gui_password = lookup(local.override[local.override_type], "storage_gui_password", local.config.storage_gui_password) + storage_gui_username = lookup(local.override[local.override_type], "storage_gui_username", local.config.storage_gui_username) + storage_instances = lookup(local.override[local.override_type], "storage_instances", local.config.storage_instances) + storage_subnets_cidr = lookup(local.override[local.override_type], "storage_subnets_cidr", local.config.storage_subnets_cidr) + vpc_name = lookup(local.override[local.override_type], "vpc_name", local.config.vpc_name) + } +} diff --git a/solutions/slurm/main.tf b/solutions/slurm/main.tf new file mode 100644 index 00000000..511598e3 --- /dev/null +++ b/solutions/slurm/main.tf @@ -0,0 +1,41 @@ +module "slurm" { + source = "./../.." 
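  # Example: per the locals above, a non-null override_json_string takes
  # precedence over override.json, so a single input can be overridden at
  # plan time without editing any file (the CIDR value is illustrative):
  #
  #   terraform plan -var 'override_json_string={"vpc_cidr":"10.20.0.0/18"}'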
+ scheduler = "Slurm" + ibm_customer_number = var.ibm_customer_number + zones = var.zones + remote_allowed_ips = var.remote_allowed_ips + cluster_prefix = local.env.cluster_prefix + ssh_keys = local.env.ssh_keys + existing_resource_group = local.env.existing_resource_group + deployer_instance = local.env.deployer_instance + vpc_cluster_login_private_subnets_cidr_blocks = local.env.vpc_cluster_login_private_subnets_cidr_blocks + vpc_cluster_private_subnets_cidr_blocks = local.env.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = local.env.cos_instance_name + dns_custom_resolver_id = local.env.dns_custom_resolver_id + dns_instance_id = local.env.dns_instance_id + dns_domain_names = local.env.dns_domain_names + dynamic_compute_instances = local.env.dynamic_compute_instances + enable_atracker = local.env.enable_atracker + # enable_bastion = local.env.enable_bastion + enable_cos_integration = local.env.enable_cos_integration + enable_vpc_flow_logs = local.env.enable_vpc_flow_logs + custom_file_shares = local.env.custom_file_shares + key_management = local.env.key_management + client_instances = local.env.client_instances + management_instances = local.env.management_instances + vpc_cidr = local.env.vpc_cidr + placement_strategy = local.env.placement_strategy + protocol_instances = local.env.protocol_instances + protocol_subnets_cidr = [local.env.protocol_subnets_cidr] + static_compute_instances = local.env.static_compute_instances + storage_instances = local.env.storage_instances + storage_subnets_cidr = [local.env.storage_subnets_cidr] + vpc_name = local.env.vpc_name + + # compute_gui_password = local.env.compute_gui_password + # compute_gui_username = local.env.compute_gui_username + # hpcs_instance_name = local.env.hpcs_instance_name + # client_subnets_cidr = [local.env.client_subnets_cidr] + # storage_gui_password = local.env.storage_gui_password + # storage_gui_username = local.env.storage_gui_username +} diff --git a/solutions/slurm/outputs.tf b/solutions/slurm/outputs.tf new file mode 100644 index 00000000..ae50c7ab --- /dev/null +++ b/solutions/slurm/outputs.tf @@ -0,0 +1,4 @@ +output "slurm" { + description = "Slurm details" + value = module.slurm +} diff --git a/solutions/slurm/override.json b/solutions/slurm/override.json new file mode 100644 index 00000000..c3a21b3e --- /dev/null +++ b/solutions/slurm/override.json @@ -0,0 +1,85 @@ +{ + "cluster_prefix": "slurm", + "existing_resource_group": "Default", + "vpc_name": null, + "vpc_cidr": "10.0.0.0/8", + "placement_strategy": null, + "ssh_keys": null, + "enable_bastion": true, + "enable_deployer": false, + "deployer_instance_profile": "mx2-4x32", + "vpc_cluster_login_private_subnets_cidr_blocks": "10.0.0.0/24", + "client_subnets_cidr": "10.10.10.0/24", + "client_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "vpc_cluster_private_subnets_cidr_blocks": "10.10.20.0/24", + "management_instances": [ + { + "profile": "cx2-2x4", + "count": 3, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "static_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 0, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "dynamic_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 5000, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "compute_gui_username": "admin", + "storage_subnets_cidr": "10.10.30.0/24", + "storage_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + 
"protocol_subnets_cidr": "10.10.40.0/24", + "protocol_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "storage_gui_username": "admin", + "custom_file_shares": [ + { + "mount_path": "/mnt/binaries", + "size": 100, + "iops": 1000 + }, + { + "mount_path": "/mnt/data", + "size": 100, + "iops": 1000 + } + ], + "dns_instance_id": null, + "dns_custom_resolver_id": null, + "dns_domain_names": { + "compute": "comp.com", + "storage": "strg.com", + "protocol": "ces.com" + }, + "enable_cos_integration": true, + "cos_instance_name": null, + "enable_atracker": true, + "enable_vpc_flow_logs": true, + "key_management": "key_protect", + "hpcs_instance_name": null +} diff --git a/solutions/slurm/variables.tf b/solutions/slurm/variables.tf new file mode 100644 index 00000000..852efaaa --- /dev/null +++ b/solutions/slurm/variables.tf @@ -0,0 +1,405 @@ +############################################################################## +# Offering Variations +############################################################################## +variable "ibm_customer_number" { + type = string + sensitive = true + description = "Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn)." + validation { + condition = can(regex("^[0-9A-Za-z]*([0-9A-Za-z]+,[0-9A-Za-z]+)*$", var.ibm_customer_number)) + error_message = "The IBM customer number input value cannot have special characters." + } +} + +############################################################################## +# Account Variables +############################################################################## +variable "ibmcloud_api_key" { + type = string + sensitive = true + description = "IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required." +} + +############################################################################## +# Cluster Level Variables +############################################################################## +variable "zones" { + description = "Specify the IBM Cloud zone within the chosen region where the IBM Spectrum LSF cluster will be deployed. A single zone input is required, and the management nodes, file storage shares, and compute nodes will all be provisioned in this zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli)." + type = list(string) + default = ["us-east-1"] + validation { + condition = length(var.zones) == 1 + error_message = "HPC product deployment supports only a single zone. Provide a value for a single zone from the supported regions: eu-de-2 or eu-de-3 for eu-de, us-east-1 or us-east-3 for us-east, and us-south-1 for us-south." + } +} + +variable "ssh_keys" { + type = list(string) + default = null + description = "The key pair to use to access the HPC cluster." +} + +variable "remote_allowed_ips" { + type = list(string) + description = "Comma-separated list of IP addresses that can access the IBM Spectrum LSF cluster instance through an SSH interface. For security purposes, provide the public IP addresses assigned to the devices that are authorized to establish SSH connections (for example, [\"169.45.117.34\"]). 
To fetch the IP address of the device, use [https://ipv4.icanhazip.com/](https://ipv4.icanhazip.com/)." + validation { + condition = alltrue([ + for o in var.remote_allowed_ips : !contains(["0.0.0.0/0", "0.0.0.0"], o) + ]) + error_message = "For security, provide the public IP addresses assigned to the devices authorized to establish SSH connections. Use https://ipv4.icanhazip.com/ to fetch the ip address of the device." + } + validation { + condition = alltrue([ + for a in var.remote_allowed_ips : can(regex("^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(/(3[0-2]|2[0-9]|1[0-9]|[0-9]))?$", a)) + ]) + error_message = "The provided IP address format is not valid. Check if the IP address contains a comma instead of a dot, and ensure there are double quotation marks between each IP address range if using multiple IP ranges. For multiple IP address, use the format [\"169.45.117.34\",\"128.122.144.145\"]." + } +} + +variable "cluster_prefix" { + type = string + default = "lsf" + description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." + validation { + error_message = "Prefix must begin and end with a letter and contain only letters, numbers, and - characters." + condition = can(regex("^([A-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.cluster_prefix)) + } + validation { + condition = length(var.cluster_prefix) <= 16 + error_message = "The cluster_prefix must be 16 characters or fewer." + } +} + +############################################################################## +# Resource Groups Variables +############################################################################## +variable "existing_resource_group" { + type = string + default = "Default" + description = "String describing resource groups to create or reference" + +} + +############################################################################## +# VPC Variables +############################################################################## +variable "vpc_name" { + type = string + default = null + description = "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" +} + +variable "vpc_cidr" { + type = string + default = "10.241.0.0/18" + description = "Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning." +} + +variable "placement_strategy" { + type = string + default = null + description = "VPC placement groups to create (null / host_spread / power_spread)" +} + +############################################################################## +# Access Variables +############################################################################## +variable "deployer_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "hpc-lsf-fp15-deployer-rhel810-v1" + profile = "bx2-8x32" + } + description = "Configuration for the deployer node, including the custom image and instance profile. By default, uses fixpack_15 image and a bx2-8x32 profile." 
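# Example: a minimal terraform.tfvars sketch for the required cluster-level
# inputs declared above (the SSH key name and allowed IP are illustrative;
# only a single zone is accepted by the validation):
#
# zones              = ["us-east-1"]
# ssh_keys           = ["my-ssh-key"]          # hypothetical key name
# remote_allowed_ips = ["169.45.117.34"]
# cluster_prefix     = "lsf"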
+} + +# variable "enable_bastion" { +# type = bool +# default = true +# description = "The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false." +# } + +variable "vpc_cluster_login_private_subnets_cidr_blocks" { + type = string + default = "10.241.16.0/28" + description = "Provide the CIDR block required for the creation of the login cluster's private subnet. Only one CIDR block is needed. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Since the login subnet is used only for the creation of login virtual server instances, provide a CIDR range of /28." + validation { + condition = tonumber(regex("^.*?/(\\d+)$", var.vpc_cluster_login_private_subnets_cidr_blocks)[0]) <= 28 + error_message = "This subnet is used to create only a login virtual server instance. Providing a larger CIDR size will waste the usage of available IPs. A CIDR range of /28 is sufficient for the creation of the login subnet." + } +} + +############################################################################## +# Compute Variables +############################################################################## +variable "client_subnets_cidr" { + type = string + default = "10.241.50.0/24" + description = "Subnet CIDR block to launch the client host." +} + +variable "client_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for client." +} + +variable "vpc_cluster_private_subnets_cidr_blocks" { + type = string + default = "10.241.0.0/20" + description = "Provide the CIDR block required for the creation of the compute cluster's private subnet. One CIDR block is required. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Ensure the selected CIDR block size can accommodate the maximum number of management and dynamic compute nodes expected in your cluster. For more information on CIDR block size selection, refer to the documentation, see [Choosing IP ranges for your VPC](https://cloud.ibm.com/docs/vpc?topic=vpc-choosing-ip-ranges-for-your-vpc)." +} + +variable "management_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for management." +} + +variable "static_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Min Number of instances to be launched for compute cluster." +} + +variable "dynamic_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 1024 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "MaxNumber of instances to be launched for compute cluster." +} + +variable "compute_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on compute cluster." 
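# Example: static_compute_instances (declared above) is the minimum,
# always-provisioned pool, while dynamic_compute_instances is the maximum
# the cluster may scale to. A sketch with one static node and headroom to
# burst to 256 (counts are illustrative only):
#
# static_compute_instances = [{
#   profile = "cx2-2x4"
#   count   = 1
#   image   = "ibm-redhat-8-10-minimal-amd64-4"
# }]
# dynamic_compute_instances = [{
#   profile = "cx2-2x4"
#   count   = 256
#   image   = "ibm-redhat-8-10-minimal-amd64-2"
# }]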
+} + +variable "compute_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for compute cluster GUI" +} + +############################################################################## +# Storage Scale Variables +############################################################################## +variable "storage_subnets_cidr" { + type = string + default = "10.241.30.0/24" + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "storage_instances" { + type = list( + object({ + profile = string + count = number + image = string + filesystem = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem = "/ibm/fs1" + }] + description = "Number of instances to be launched for storage cluster." +} + +variable "protocol_subnets_cidr" { + type = string + default = "10.241.40.0/24" + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "protocol_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for protocol hosts." +} + +variable "storage_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on storage cluster." +} + +variable "storage_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for storage cluster GUI" +} + +variable "custom_file_shares" { + type = list( + object({ + mount_path = string, + size = number, + iops = number + }) + ) + default = [{ + mount_path = "/mnt/binaries" + size = 100 + iops = 1000 + }, { + mount_path = "/mnt/data" + size = 100 + iops = 1000 + }] + description = "Custom file shares to access shared storage" +} + +############################################################################## +# DNS Variables +############################################################################## + +variable "dns_instance_id" { + type = string + default = null + description = "IBM Cloud HPC DNS service instance id." +} + +variable "dns_custom_resolver_id" { + type = string + default = null + description = "IBM Cloud DNS custom resolver id." +} + +variable "dns_domain_names" { + type = object({ + compute = string + storage = string + protocol = string + client = string + gklm = string + }) + default = { + compute = "comp.com" + storage = "strg.com" + protocol = "ces.com" + client = "clnt.com" + gklm = "gklm.com" + } + description = "IBM Cloud HPC DNS domain names." +} + +############################################################################## +# Encryption Variables +############################################################################## +variable "key_management" { + type = string + default = "key_protect" + description = "Set the value as key_protect to enable customer managed encryption for boot volume and file share. If the key_management is set as null, IBM Cloud resources will be always be encrypted through provider managed." + validation { + condition = var.key_management == "null" || var.key_management == null || var.key_management == "key_protect" + error_message = "key_management must be either 'null' or 'key_protect'." 
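# Example: custom_file_shares (declared above) accepts any number of shares;
# a sketch adding a third share for scratch data. The mount path, size, and
# iops values are illustrative only.
#
# custom_file_shares = [
#   { mount_path = "/mnt/binaries", size = 100, iops = 1000 },
#   { mount_path = "/mnt/data",     size = 100, iops = 1000 },
#   { mount_path = "/mnt/scratch",  size = 500, iops = 3000 }
# ]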
+ } +} + +variable "hpcs_instance_name" { + type = string + default = null + description = "Hyper Protect Crypto Service instance" +} + +############################################################################## +# Observability Variables +############################################################################## +variable "enable_cos_integration" { + type = bool + default = true + description = "Integrate COS with HPC solution" +} + +variable "cos_instance_name" { + type = string + default = null + description = "Exiting COS instance name" +} + +variable "enable_atracker" { + type = bool + default = true + description = "Enable Activity tracker" +} + +variable "enable_vpc_flow_logs" { + type = bool + default = true + description = "Enable Activity tracker" +} + +############################################################################## +# Override JSON +############################################################################## +variable "override" { + type = bool + default = false + description = "Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment." + +} + +variable "override_json_string" { + type = string + default = null + description = "Override default values with a JSON object. Any JSON other than an empty string overrides other configuration changes." +} diff --git a/solutions/slurm/version.tf b/solutions/slurm/version.tf new file mode 100644 index 00000000..93f82bed --- /dev/null +++ b/solutions/slurm/version.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.9.0" + required_providers { + ibm = { + source = "IBM-Cloud/ibm" + version = ">= 1.68.1, < 2.0.0" + } + } +} + +provider "ibm" { + ibmcloud_api_key = var.ibmcloud_api_key + region = local.region +} diff --git a/solutions/symphony/README.md b/solutions/symphony/README.md new file mode 100644 index 00000000..825806e3 --- /dev/null +++ b/solutions/symphony/README.md @@ -0,0 +1,79 @@ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [ibm](#requirement\_ibm) | >= 1.68.1, < 2.0.0 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [symphony](#module\_symphony) | ./../.. | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [allowed\_cidr](#input\_allowed\_cidr) | Network CIDR to access the VPC. This is used to manage network ACL rules for accessing the cluster. | `list(string)` | n/a | yes | +| [bastion\_ssh\_keys](#input\_bastion\_ssh\_keys) | The key pair to use to access the bastion host. | `list(string)` | `null` | no | +| [bastion\_subnets\_cidr](#input\_bastion\_subnets\_cidr) | Subnet CIDR block to launch the bastion host. | `string` | `"10.0.0.0/24"` | no | +| [client\_instances](#input\_client\_instances) | Number of instances to be launched for client. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 2,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [client\_ssh\_keys](#input\_client\_ssh\_keys) | The key pair to use to launch the client host. | `list(string)` | `null` | no | +| [client\_subnets\_cidr](#input\_client\_subnets\_cidr) | Subnet CIDR block to launch the client host. | `string` | `"10.10.10.0/24"` | no | +| [compute\_gui\_password](#input\_compute\_gui\_password) | Password for compute cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [compute\_gui\_username](#input\_compute\_gui\_username) | GUI user to perform system management and monitoring tasks on compute cluster. | `string` | `"admin"` | no | +| [compute\_ssh\_keys](#input\_compute\_ssh\_keys) | The key pair to use to launch the compute host. | `list(string)` | `null` | no | +| [compute\_subnets\_cidr](#input\_compute\_subnets\_cidr) | Subnet CIDR block to launch the compute cluster host. | `string` | `"10.10.20.0/24"` | no | +| [cos\_instance\_name](#input\_cos\_instance\_name) | Exiting COS instance name | `string` | `null` | no | +| [deployer\_instance\_profile](#input\_deployer\_instance\_profile) | Deployer should be only used for better deployment performance | `string` | `"mx2-4x32"` | no | +| [dns\_custom\_resolver\_id](#input\_dns\_custom\_resolver\_id) | IBM Cloud DNS custom resolver id. | `string` | `null` | no | +| [dns\_domain\_names](#input\_dns\_domain\_names) | IBM Cloud HPC DNS domain names. |
object({
compute = string
storage = string
protocol = string
})
|
{
"compute": "comp.com",
"protocol": "ces.com",
"storage": "strg.com"
}
| no | +| [dns\_instance\_id](#input\_dns\_instance\_id) | IBM Cloud HPC DNS service instance id. | `string` | `null` | no | +| [dynamic\_compute\_instances](#input\_dynamic\_compute\_instances) | MaxNumber of instances to be launched for compute cluster. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 1024,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [enable\_atracker](#input\_enable\_atracker) | Enable Activity tracker | `bool` | `true` | no | +| [enable\_bastion](#input\_enable\_bastion) | The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false. | `bool` | `true` | no | +| [enable\_cos\_integration](#input\_enable\_cos\_integration) | Integrate COS with HPC solution | `bool` | `true` | no | +| [enable\_deployer](#input\_enable\_deployer) | Deployer should be only used for better deployment performance | `bool` | `false` | no | +| [enable\_vpc\_flow\_logs](#input\_enable\_vpc\_flow\_logs) | Enable Activity tracker | `bool` | `true` | no | +| [enable\_vpn](#input\_enable\_vpn) | The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN, set this value to true. | `bool` | `false` | no | +| [existing\_resource\_group](#input\_existing\_resource\_group) | String describing resource groups to create or reference | `string` | `"Default"` | no | +| [file\_shares](#input\_file\_shares) | Custom file shares to access shared storage |
list(
object({
mount_path = string,
size = number,
iops = number
})
)
|
[
{
"iops": 1000,
"mount_path": "/mnt/binaries",
"size": 100
},
{
"iops": 1000,
"mount_path": "/mnt/data",
"size": 100
}
]
| no | +| [hpcs\_instance\_name](#input\_hpcs\_instance\_name) | Hyper Protect Crypto Service instance | `string` | `null` | no | +| [ibm\_customer\_number](#input\_ibm\_customer\_number) | Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn). | `string` | n/a | yes | +| [ibmcloud\_api\_key](#input\_ibmcloud\_api\_key) | IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required. | `string` | n/a | yes | +| [key\_management](#input\_key\_management) | Set the value as key\_protect to enable customer managed encryption for boot volume and file share. If the key\_management is set as null, IBM Cloud resources will be always be encrypted through provider managed. | `string` | `"key_protect"` | no | +| [management\_instances](#input\_management\_instances) | Number of instances to be launched for management. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 2,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [network\_cidr](#input\_network\_cidr) | Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning. | `string` | `"10.0.0.0/8"` | no | +| [override](#input\_override) | Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment. | `bool` | `false` | no | +| [override\_json\_string](#input\_override\_json\_string) | Override default values with a JSON object. Any JSON other than an empty string overrides other configuration changes. | `string` | `null` | no | +| [placement\_strategy](#input\_placement\_strategy) | VPC placement groups to create (null / host\_spread / power\_spread) | `string` | `null` | no | +| [cluster_prefix](#input\_prefix) | A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters. | `string` | `"lsf"` | no | +| [protocol\_instances](#input\_protocol\_instances) | Number of instances to be launched for protocol hosts. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 2,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "bx2-2x8"
}
]
| no | +| [protocol\_subnets\_cidr](#input\_protocol\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `string` | `"10.10.40.0/24"` | no | +| [ssh\_keys](#input\_ssh\_keys) | The key pair to use to access the HPC cluster. | `list(string)` | `null` | no | +| [static\_compute\_instances](#input\_static\_compute\_instances) | Min Number of instances to be launched for compute cluster. |
list(
object({
profile = string
count = number
image = string
})
)
|
[
{
"count": 1,
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "cx2-2x4"
}
]
| no | +| [storage\_gui\_password](#input\_storage\_gui\_password) | Password for storage cluster GUI | `string` | `"hpc@IBMCloud"` | no | +| [storage\_gui\_username](#input\_storage\_gui\_username) | GUI user to perform system management and monitoring tasks on storage cluster. | `string` | `"admin"` | no | +| [storage\_instances](#input\_storage\_instances) | Number of instances to be launched for storage cluster. |
list(
object({
profile = string
count = number
image = string
filesystem_name = optional(string)
})
)
|
[
{
"count": 2,
"filesystem_name": "fs1",
"image": "ibm-redhat-8-10-minimal-amd64-2",
"profile": "bx2-2x8"
}
]
| no | +| [storage\_ssh\_keys](#input\_storage\_ssh\_keys) | The key pair to use to launch the storage cluster host. | `list(string)` | `null` | no | +| [storage\_subnets\_cidr](#input\_storage\_subnets\_cidr) | Subnet CIDR block to launch the storage cluster host. | `string` | `"10.10.30.0/24"` | no | +| [vpc](#input\_vpc) | Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc) | `string` | `null` | no | +| [vpn\_peer\_address](#input\_vpn\_peer\_address) | The peer public IP address to which the VPN will be connected. | `string` | `null` | no | +| [vpn\_peer\_cidr](#input\_vpn\_peer\_cidr) | The peer CIDRs (e.g., 192.168.0.0/24) to which the VPN will be connected. | `list(string)` | `null` | no | +| [vpn\_preshared\_key](#input\_vpn\_preshared\_key) | The pre-shared key for the VPN. | `string` | `null` | no | +| [zone](#input\_zone) | Zone where VPC will be created. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [symphony](#output\_symphony) | Symphony details | diff --git a/solutions/symphony/catalogValidationValues.json.template b/solutions/symphony/catalogValidationValues.json.template new file mode 100644 index 00000000..bb5298d4 --- /dev/null +++ b/solutions/symphony/catalogValidationValues.json.template @@ -0,0 +1,7 @@ +{ + "ibmcloud_api_key": $VALIDATION_APIKEY, + "cluster_prefix": $PREFIX, + "zones": "[\"ca-tor-1\"]", + "existing_resource_group": "geretain-hpc-rg", + "ssh_keys": "[\"geretain-hpc-ssh-key\"]" +} diff --git a/solutions/symphony/locals.tf b/solutions/symphony/locals.tf new file mode 100644 index 00000000..9864f0a7 --- /dev/null +++ b/solutions/symphony/locals.tf @@ -0,0 +1,99 @@ +# locals needed for ibm provider +locals { + # Region and Zone calculations + region = join("-", slice(split("-", var.zones[0]), 0, 2)) +} + + +locals { + override_json_path = abspath("./override.json") + override = { + override = jsondecode(var.override && var.override_json_string == null ? + (local.override_json_path == "" ? file("${path.root}/override.json") : file(local.override_json_path)) + : + "{}") + override_json_string = jsondecode(var.override_json_string == null ? "{}" : var.override_json_string) + } + override_type = var.override_json_string == null ? 
"override" : "override_json_string" +} + + +locals { + config = { + existing_resource_group = var.existing_resource_group + remote_allowed_ips = var.remote_allowed_ips + deployer_instance = var.deployer_instance + ssh_keys = var.ssh_keys + vpc_cluster_login_private_subnets_cidr_blocks = var.vpc_cluster_login_private_subnets_cidr_blocks + compute_gui_password = var.compute_gui_password + compute_gui_username = var.compute_gui_username + vpc_cluster_private_subnets_cidr_blocks = var.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = var.cos_instance_name + dns_custom_resolver_id = var.dns_custom_resolver_id + dns_instance_id = var.dns_instance_id + dns_domain_names = var.dns_domain_names + dynamic_compute_instances = var.dynamic_compute_instances + enable_atracker = var.enable_atracker + # enable_bastion = var.enable_bastion + enable_cos_integration = var.enable_cos_integration + enable_vpc_flow_logs = var.enable_vpc_flow_logs + custom_file_shares = var.custom_file_shares + hpcs_instance_name = var.hpcs_instance_name + key_management = var.key_management + client_instances = var.client_instances + client_subnets_cidr = var.client_subnets_cidr + management_instances = var.management_instances + vpc_cidr = var.vpc_cidr + placement_strategy = var.placement_strategy + cluster_prefix = var.cluster_prefix + protocol_instances = var.protocol_instances + protocol_subnets_cidr = var.protocol_subnets_cidr + static_compute_instances = var.static_compute_instances + storage_gui_password = var.storage_gui_password + storage_gui_username = var.storage_gui_username + storage_instances = var.storage_instances + storage_subnets_cidr = var.storage_subnets_cidr + vpc_name = var.vpc_name + } +} + + +# Compile Environment for Config output +locals { + env = { + existing_resource_group = lookup(local.override[local.override_type], "existing_resource_group", local.config.existing_resource_group) + remote_allowed_ips = lookup(local.override[local.override_type], "remote_allowed_ips", local.config.remote_allowed_ips) + deployer_instance = lookup(local.override[local.override_type], "deployer_instance", local.config.deployer_instance) + ssh_keys = lookup(local.override[local.override_type], "ssh_keys", local.config.ssh_keys) + vpc_cluster_login_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_login_private_subnets_cidr_blocks", local.config.vpc_cluster_login_private_subnets_cidr_blocks) + compute_gui_password = lookup(local.override[local.override_type], "compute_gui_password", local.config.compute_gui_password) + compute_gui_username = lookup(local.override[local.override_type], "compute_gui_username", local.config.compute_gui_username) + vpc_cluster_private_subnets_cidr_blocks = lookup(local.override[local.override_type], "vpc_cluster_private_subnets_cidr_blocks", local.config.vpc_cluster_private_subnets_cidr_blocks) + cos_instance_name = lookup(local.override[local.override_type], "cos_instance_name", local.config.cos_instance_name) + dns_custom_resolver_id = lookup(local.override[local.override_type], "dns_custom_resolver_id", local.config.dns_custom_resolver_id) + dns_instance_id = lookup(local.override[local.override_type], "dns_instance_id", local.config.dns_instance_id) + dns_domain_names = lookup(local.override[local.override_type], "dns_domain_names", local.config.dns_domain_names) + dynamic_compute_instances = lookup(local.override[local.override_type], "dynamic_compute_instances", local.config.dynamic_compute_instances) + enable_atracker = 
lookup(local.override[local.override_type], "enable_atracker", local.config.enable_atracker) + # enable_bastion = lookup(local.override[local.override_type], "enable_bastion", local.config.enable_bastion) + enable_cos_integration = lookup(local.override[local.override_type], "enable_cos_integration", local.config.enable_cos_integration) + enable_vpc_flow_logs = lookup(local.override[local.override_type], "enable_vpc_flow_logs", local.config.enable_vpc_flow_logs) + custom_file_shares = lookup(local.override[local.override_type], "custom_file_shares", local.config.custom_file_shares) + hpcs_instance_name = lookup(local.override[local.override_type], "hpcs_instance_name", local.config.hpcs_instance_name) + key_management = lookup(local.override[local.override_type], "key_management", local.config.key_management) + client_instances = lookup(local.override[local.override_type], "client_instances", local.config.client_instances) + client_subnets_cidr = lookup(local.override[local.override_type], "client_subnets_cidr", local.config.client_subnets_cidr) + management_instances = lookup(local.override[local.override_type], "management_instances", local.config.management_instances) + vpc_cidr = lookup(local.override[local.override_type], "vpc_cidr", local.config.vpc_cidr) + placement_strategy = lookup(local.override[local.override_type], "placement_strategy", local.config.placement_strategy) + cluster_prefix = lookup(local.override[local.override_type], "cluster_prefix", local.config.cluster_prefix) + protocol_instances = lookup(local.override[local.override_type], "protocol_instances", local.config.protocol_instances) + protocol_subnets_cidr = lookup(local.override[local.override_type], "protocol_subnets_cidr", local.config.protocol_subnets_cidr) + static_compute_instances = lookup(local.override[local.override_type], "static_compute_instances", local.config.static_compute_instances) + storage_gui_password = lookup(local.override[local.override_type], "storage_gui_password", local.config.storage_gui_password) + storage_gui_username = lookup(local.override[local.override_type], "storage_gui_username", local.config.storage_gui_username) + storage_instances = lookup(local.override[local.override_type], "storage_instances", local.config.storage_instances) + storage_subnets_cidr = lookup(local.override[local.override_type], "storage_subnets_cidr", local.config.storage_subnets_cidr) + vpc_name = lookup(local.override[local.override_type], "vpc_name", local.config.vpc_name) + } +} diff --git a/solutions/symphony/main.tf b/solutions/symphony/main.tf new file mode 100644 index 00000000..05b75645 --- /dev/null +++ b/solutions/symphony/main.tf @@ -0,0 +1,41 @@ +module "symphony" { + source = "./../.." 
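  # Editorial note, for illustration only (not part of the original change):
  # each local.env reference below resolves to the matching var.* value unless
  # override.json (with var.override = true) or var.override_json_string
  # supplies the same key; when var.override_json_string is set it takes
  # precedence over override.json. For example, running something like
  #   terraform apply -var 'override=true'
  # against the shipped override.json (which contains "vpc_cidr": "10.0.0.0/8")
  # passes 10.0.0.0/8 to the module instead of the variable default
  # 10.241.0.0/18. Similarly, local.region is derived from the first zone, so
  # zones = ["us-east-1"] yields the provider region "us-east".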
+ scheduler = "Symphony" + ibm_customer_number = var.ibm_customer_number + zones = var.zones + remote_allowed_ips = var.remote_allowed_ips + cluster_prefix = local.env.cluster_prefix + ssh_keys = local.env.ssh_keys + existing_resource_group = local.env.existing_resource_group + deployer_instance = local.env.deployer_instance + vpc_cluster_login_private_subnets_cidr_blocks = local.env.vpc_cluster_login_private_subnets_cidr_blocks + vpc_cluster_private_subnets_cidr_blocks = local.env.vpc_cluster_private_subnets_cidr_blocks + cos_instance_name = local.env.cos_instance_name + dns_custom_resolver_id = local.env.dns_custom_resolver_id + dns_instance_id = local.env.dns_instance_id + dns_domain_names = local.env.dns_domain_names + dynamic_compute_instances = local.env.dynamic_compute_instances + enable_atracker = local.env.enable_atracker + # enable_bastion = local.env.enable_bastion + enable_cos_integration = local.env.enable_cos_integration + enable_vpc_flow_logs = local.env.enable_vpc_flow_logs + custom_file_shares = local.env.custom_file_shares + key_management = local.env.key_management + client_instances = local.env.client_instances + management_instances = local.env.management_instances + vpc_cidr = local.env.vpc_cidr + placement_strategy = local.env.placement_strategy + protocol_instances = local.env.protocol_instances + protocol_subnets_cidr = [local.env.protocol_subnets_cidr] + static_compute_instances = local.env.static_compute_instances + storage_instances = local.env.storage_instances + storage_subnets_cidr = [local.env.storage_subnets_cidr] + vpc_name = local.env.vpc_name + + # compute_gui_password = local.env.compute_gui_password + # compute_gui_username = local.env.compute_gui_username + # hpcs_instance_name = local.env.hpcs_instance_name + # client_subnets_cidr = [local.env.client_subnets_cidr] + # storage_gui_password = local.env.storage_gui_password + # storage_gui_username = local.env.storage_gui_username +} diff --git a/solutions/symphony/outputs.tf b/solutions/symphony/outputs.tf new file mode 100644 index 00000000..7affaab0 --- /dev/null +++ b/solutions/symphony/outputs.tf @@ -0,0 +1,4 @@ +output "symphony" { + description = "Symphony details" + value = module.symphony +} diff --git a/solutions/symphony/override.json b/solutions/symphony/override.json new file mode 100644 index 00000000..d87c9690 --- /dev/null +++ b/solutions/symphony/override.json @@ -0,0 +1,85 @@ +{ + "cluster_prefix": "symphony", + "existing_resource_group": "Default", + "vpc_name": null, + "vpc_cidr": "10.0.0.0/8", + "placement_strategy": null, + "ssh_keys": null, + "enable_bastion": true, + "enable_deployer": false, + "deployer_instance_profile": "mx2-4x32", + "vpc_cluster_login_private_subnets_cidr_blocks": "10.0.0.0/24", + "client_subnets_cidr": "10.10.10.0/24", + "client_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "vpc_cluster_private_subnets_cidr_blocks": "10.10.20.0/24", + "management_instances": [ + { + "profile": "cx2-2x4", + "count": 3, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "static_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 0, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "dynamic_compute_instances": [ + { + "profile": "cx2-2x4", + "count": 5000, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "compute_gui_username": "admin", + "storage_subnets_cidr": "10.10.30.0/24", + "storage_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": 
"ibm-redhat-8-10-minimal-amd64-2" + } + ], + "protocol_subnets_cidr": "10.10.40.0/24", + "protocol_instances": [ + { + "profile": "cx2-2x4", + "count": 2, + "image": "ibm-redhat-8-10-minimal-amd64-2" + } + ], + "storage_gui_username": "admin", + "custom_file_shares": [ + { + "mount_path": "/mnt/binaries", + "size": 100, + "iops": 1000 + }, + { + "mount_path": "/mnt/data", + "size": 100, + "iops": 1000 + } + ], + "dns_instance_id": null, + "dns_custom_resolver_id": null, + "dns_domain_names": { + "compute": "comp.com", + "storage": "strg.com", + "protocol": "ces.com" + }, + "enable_cos_integration": true, + "cos_instance_name": null, + "enable_atracker": true, + "enable_vpc_flow_logs": true, + "key_management": "key_protect", + "hpcs_instance_name": null +} diff --git a/solutions/symphony/variables.tf b/solutions/symphony/variables.tf new file mode 100644 index 00000000..852efaaa --- /dev/null +++ b/solutions/symphony/variables.tf @@ -0,0 +1,405 @@ +############################################################################## +# Offering Variations +############################################################################## +variable "ibm_customer_number" { + type = string + sensitive = true + description = "Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn)." + validation { + condition = can(regex("^[0-9A-Za-z]*([0-9A-Za-z]+,[0-9A-Za-z]+)*$", var.ibm_customer_number)) + error_message = "The IBM customer number input value cannot have special characters." + } +} + +############################################################################## +# Account Variables +############################################################################## +variable "ibmcloud_api_key" { + type = string + sensitive = true + description = "IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required." +} + +############################################################################## +# Cluster Level Variables +############################################################################## +variable "zones" { + description = "Specify the IBM Cloud zone within the chosen region where the IBM Spectrum LSF cluster will be deployed. A single zone input is required, and the management nodes, file storage shares, and compute nodes will all be provisioned in this zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli)." + type = list(string) + default = ["us-east-1"] + validation { + condition = length(var.zones) == 1 + error_message = "HPC product deployment supports only a single zone. Provide a value for a single zone from the supported regions: eu-de-2 or eu-de-3 for eu-de, us-east-1 or us-east-3 for us-east, and us-south-1 for us-south." + } +} + +variable "ssh_keys" { + type = list(string) + default = null + description = "The key pair to use to access the HPC cluster." +} + +variable "remote_allowed_ips" { + type = list(string) + description = "Comma-separated list of IP addresses that can access the IBM Spectrum LSF cluster instance through an SSH interface. For security purposes, provide the public IP addresses assigned to the devices that are authorized to establish SSH connections (for example, [\"169.45.117.34\"]). 
To fetch the IP address of the device, use [https://ipv4.icanhazip.com/](https://ipv4.icanhazip.com/)." + validation { + condition = alltrue([ + for o in var.remote_allowed_ips : !contains(["0.0.0.0/0", "0.0.0.0"], o) + ]) + error_message = "For security, provide the public IP addresses assigned to the devices authorized to establish SSH connections. Use https://ipv4.icanhazip.com/ to fetch the ip address of the device." + } + validation { + condition = alltrue([ + for a in var.remote_allowed_ips : can(regex("^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(/(3[0-2]|2[0-9]|1[0-9]|[0-9]))?$", a)) + ]) + error_message = "The provided IP address format is not valid. Check if the IP address contains a comma instead of a dot, and ensure there are double quotation marks between each IP address range if using multiple IP ranges. For multiple IP address, use the format [\"169.45.117.34\",\"128.122.144.145\"]." + } +} + +variable "cluster_prefix" { + type = string + default = "lsf" + description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." + validation { + error_message = "Prefix must begin and end with a letter and contain only letters, numbers, and - characters." + condition = can(regex("^([A-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.cluster_prefix)) + } + validation { + condition = length(var.cluster_prefix) <= 16 + error_message = "The cluster_prefix must be 16 characters or fewer." + } +} + +############################################################################## +# Resource Groups Variables +############################################################################## +variable "existing_resource_group" { + type = string + default = "Default" + description = "String describing resource groups to create or reference" + +} + +############################################################################## +# VPC Variables +############################################################################## +variable "vpc_name" { + type = string + default = null + description = "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" +} + +variable "vpc_cidr" { + type = string + default = "10.241.0.0/18" + description = "Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning." +} + +variable "placement_strategy" { + type = string + default = null + description = "VPC placement groups to create (null / host_spread / power_spread)" +} + +############################################################################## +# Access Variables +############################################################################## +variable "deployer_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "hpc-lsf-fp15-deployer-rhel810-v1" + profile = "bx2-8x32" + } + description = "Configuration for the deployer node, including the custom image and instance profile. By default, uses fixpack_15 image and a bx2-8x32 profile." 
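  # Illustration only; these values are assumed placeholders, not part of the
  # original change. A minimal terraform.tfvars that satisfies the validations
  # defined above might look like:
  #   zones              = ["us-east-1"]
  #   remote_allowed_ips = ["203.0.113.10"]   # public IP of the workstation opening SSH sessions
  #   ssh_keys           = ["my-vpc-ssh-key"] # hypothetical key name
  #   cluster_prefix     = "symphony-dev"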
+} + +# variable "enable_bastion" { +# type = bool +# default = true +# description = "The solution supports multiple ways to connect to your HPC cluster for example, using bastion node, via VPN or direct connection. If connecting to the HPC cluster via VPN or direct connection, set this value to false." +# } + +variable "vpc_cluster_login_private_subnets_cidr_blocks" { + type = string + default = "10.241.16.0/28" + description = "Provide the CIDR block required for the creation of the login cluster's private subnet. Only one CIDR block is needed. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Since the login subnet is used only for the creation of login virtual server instances, provide a CIDR range of /28." + validation { + condition = tonumber(regex("^.*?/(\\d+)$", var.vpc_cluster_login_private_subnets_cidr_blocks)[0]) <= 28 + error_message = "This subnet is used to create only a login virtual server instance. Providing a larger CIDR size will waste the usage of available IPs. A CIDR range of /28 is sufficient for the creation of the login subnet." + } +} + +############################################################################## +# Compute Variables +############################################################################## +variable "client_subnets_cidr" { + type = string + default = "10.241.50.0/24" + description = "Subnet CIDR block to launch the client host." +} + +variable "client_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for client." +} + +variable "vpc_cluster_private_subnets_cidr_blocks" { + type = string + default = "10.241.0.0/20" + description = "Provide the CIDR block required for the creation of the compute cluster's private subnet. One CIDR block is required. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Ensure the selected CIDR block size can accommodate the maximum number of management and dynamic compute nodes expected in your cluster. For more information on CIDR block size selection, refer to the documentation, see [Choosing IP ranges for your VPC](https://cloud.ibm.com/docs/vpc?topic=vpc-choosing-ip-ranges-for-your-vpc)." +} + +variable "management_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for management." +} + +variable "static_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + }] + description = "Min Number of instances to be launched for compute cluster." +} + +variable "dynamic_compute_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "cx2-2x4" + count = 1024 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "MaxNumber of instances to be launched for compute cluster." +} + +variable "compute_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on compute cluster." 
+} + +variable "compute_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for compute cluster GUI" +} + +############################################################################## +# Storage Scale Variables +############################################################################## +variable "storage_subnets_cidr" { + type = string + default = "10.241.30.0/24" + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "storage_instances" { + type = list( + object({ + profile = string + count = number + image = string + filesystem = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 0 + image = "ibm-redhat-8-10-minimal-amd64-4" + filesystem = "/ibm/fs1" + }] + description = "Number of instances to be launched for storage cluster." +} + +variable "protocol_subnets_cidr" { + type = string + default = "10.241.40.0/24" + description = "Subnet CIDR block to launch the storage cluster host." +} + +variable "protocol_instances" { + type = list( + object({ + profile = string + count = number + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + count = 2 + image = "ibm-redhat-8-10-minimal-amd64-2" + }] + description = "Number of instances to be launched for protocol hosts." +} + +variable "storage_gui_username" { + type = string + default = "admin" + sensitive = true + description = "GUI user to perform system management and monitoring tasks on storage cluster." +} + +variable "storage_gui_password" { + type = string + default = "hpc@IBMCloud" + sensitive = true + description = "Password for storage cluster GUI" +} + +variable "custom_file_shares" { + type = list( + object({ + mount_path = string, + size = number, + iops = number + }) + ) + default = [{ + mount_path = "/mnt/binaries" + size = 100 + iops = 1000 + }, { + mount_path = "/mnt/data" + size = 100 + iops = 1000 + }] + description = "Custom file shares to access shared storage" +} + +############################################################################## +# DNS Variables +############################################################################## + +variable "dns_instance_id" { + type = string + default = null + description = "IBM Cloud HPC DNS service instance id." +} + +variable "dns_custom_resolver_id" { + type = string + default = null + description = "IBM Cloud DNS custom resolver id." +} + +variable "dns_domain_names" { + type = object({ + compute = string + storage = string + protocol = string + client = string + gklm = string + }) + default = { + compute = "comp.com" + storage = "strg.com" + protocol = "ces.com" + client = "clnt.com" + gklm = "gklm.com" + } + description = "IBM Cloud HPC DNS domain names." +} + +############################################################################## +# Encryption Variables +############################################################################## +variable "key_management" { + type = string + default = "key_protect" + description = "Set the value as key_protect to enable customer managed encryption for boot volume and file share. If the key_management is set as null, IBM Cloud resources will be always be encrypted through provider managed." + validation { + condition = var.key_management == "null" || var.key_management == null || var.key_management == "key_protect" + error_message = "key_management must be either 'null' or 'key_protect'." 
+ } +} + +variable "hpcs_instance_name" { + type = string + default = null + description = "Hyper Protect Crypto Service instance" +} + +############################################################################## +# Observability Variables +############################################################################## +variable "enable_cos_integration" { + type = bool + default = true + description = "Integrate COS with HPC solution" +} + +variable "cos_instance_name" { + type = string + default = null + description = "Exiting COS instance name" +} + +variable "enable_atracker" { + type = bool + default = true + description = "Enable Activity tracker" +} + +variable "enable_vpc_flow_logs" { + type = bool + default = true + description = "Enable Activity tracker" +} + +############################################################################## +# Override JSON +############################################################################## +variable "override" { + type = bool + default = false + description = "Override default values with custom JSON template. This uses the file `override.json` to allow users to create a fully customized environment." + +} + +variable "override_json_string" { + type = string + default = null + description = "Override default values with a JSON object. Any JSON other than an empty string overrides other configuration changes." +} diff --git a/solutions/symphony/version.tf b/solutions/symphony/version.tf new file mode 100644 index 00000000..93f82bed --- /dev/null +++ b/solutions/symphony/version.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.9.0" + required_providers { + ibm = { + source = "IBM-Cloud/ibm" + version = ">= 1.68.1, < 2.0.0" + } + } +} + +provider "ibm" { + ibmcloud_api_key = var.ibmcloud_api_key + region = local.region +} diff --git a/tests/data/lsf_fp14_config.yml b/tests/data/lsf_fp14_config.yml new file mode 100644 index 00000000..83ea99b4 --- /dev/null +++ b/tests/data/lsf_fp14_config.yml @@ -0,0 +1,82 @@ +Scheduler: LSF +lsf_version: fixpack_14 +zones: eu-gb-1 +remote_allowed_ips: +cluster_name: HPC-LSF-1 +ssh_keys: geretain-hpc +default_existing_resource_group: HPCC +non_default_existing_resource_group: Default +dns_domain_name: + compute: comp.com +bastion_instance: + profile: cx2-4x8 + image: ibm-ubuntu-22-04-5-minimal-amd64-3 +deployer_instance: + profile: bx2-8x32 + image: hpc-lsf-fp14-deployer-rhel810-v1 +enable_cos_integration: false +enable_vpc_flow_logs: false +custom_file_shares: + - mount_path: /mnt/vpcstorage/tools + size: 100 + iops: 1000 + - mount_path: /mnt/vpcstorage/data + size: 100 + iops: 1000 +key_management: key_protect +login_instance: + - profile: bx2-2x8 + image: hpc-lsf-fp14-compute-rhel810-v1 +management_instances: + - profile: bx2d-4x16 + count: 2 + image: hpc-lsf-fp14-rhel810-v1 +static_compute_instances: + - profile: bx2-2x8 + count: 2 + image: hpc-lsf-fp14-compute-rhel810-v1 +dynamic_compute_instances: + - profile: bx2-2x8 + count: 1024 + image: hpc-lsf-fp14-compute-rhel810-v1 +placement_strategy: spread +kms_instance_name: cicd-lsf-dnd-kms-instance +kms_key_name: cicd-lsf-dnd-kms-key +app_center_gui_password: Pass@1234 # pragma: allowlist secret +observability_atracker_enable: true +observability_atracker_target_type: cloudlogs +observability_monitoring_enable: true +observability_monitoring_on_compute_nodes_enable: true +observability_logs_enable_for_management: true +observability_logs_enable_for_compute: true +observability_enable_platform_logs: true +observability_enable_metrics_routing: 
true +observability_logs_retention_period: 7 +observability_monitoring_plan: graduated-tier +sccwp_enable: true +cspm_enabled: true +sccwp_service_plan: graduated-tier +app_config_plan: standardv2 +enable_hyperthreading: true +enable_ldap: true +ldap_basedns: cicdldap.com +ldap_admin_password: Pass@123 # pragma: allowlist secret +ldap_user_name: tester +ldap_user_password: Pass@123 # pragma: allowlist secret +ldap_instance: + - profile: cx2-2x4 + image: ibm-ubuntu-22-04-5-minimal-amd64-1 +us_east_zone: us-east-3 +us_east_cluster_name: HPC-LSF-1 +eu_de_zone: eu-de-3 +eu_de_cluster_name: HPC-LSF-2 +us_south_zone: us-south-1 +us_south_cluster_name: HPC-LSF-2 +jp_tok_zone: jp-tok-1 +jp_tok_cluster_name: HPC-LSF-2 +attracker_test_zone: jp-tok-1 #added for testing purpose +management_instances_image: hpc-lsf-fp14-rhel810-v1 #added for testing purpose +static_compute_instances_image: hpc-lsf-fp14-compute-rhel810-v1 #added for testing purpose +dynamic_compute_instances_image: hpc-lsf-fp14-compute-rhel810-v1 #added for testing purpose +ssh_file_path: /artifacts/.ssh/id_rsa +ssh_file_path_two: /artifacts/.ssh/id_rsa diff --git a/tests/data/lsf_fp15_config.yml b/tests/data/lsf_fp15_config.yml new file mode 100644 index 00000000..e0b4085b --- /dev/null +++ b/tests/data/lsf_fp15_config.yml @@ -0,0 +1,83 @@ +Scheduler: LSF +lsf_version: fixpack_15 +zones: jp-tok-1 +remote_allowed_ips: +cluster_name: HPC-LSF-1 +ssh_keys: geretain-hpc +default_existing_resource_group: HPCC +non_default_existing_resource_group: Default +dns_domain_name: + compute: comp.com +bastion_instance: + profile: cx2-4x8 + image: ibm-ubuntu-22-04-5-minimal-amd64-3 +deployer_instance: + profile: bx2-8x32 + image: hpc-lsf-fp15-deployer-rhel810-v1 +enable_cos_integration: false +enable_vpc_flow_logs: false +custom_file_shares: + - mount_path: /mnt/vpcstorage/tools + size: 100 + iops: 1000 + - mount_path: /mnt/vpcstorage/data + size: 100 + iops: 1000 +key_management: key_protect +login_instance: + - profile: bx2-2x8 + image: hpc-lsf-fp15-compute-rhel810-v1 +management_instances: + - profile: bx2d-4x16 + count: 2 + image: hpc-lsf-fp15-rhel810-v1 +static_compute_instances: + - profile: bx2-2x8 + count: 2 + image: hpc-lsf-fp15-compute-rhel810-v1 +dynamic_compute_instances: + - profile: bx2-2x8 + count: 1024 + image: hpc-lsf-fp15-compute-rhel810-v1 +placement_strategy: spread +kms_instance_name: cicd-lsf-dnd-kms-instance +kms_key_name: cicd-lsf-dnd-kms-key +app_center_gui_password: Pass@1234 # pragma: allowlist secret +observability_atracker_enable: true +observability_atracker_target_type: cloudlogs +observability_monitoring_enable: true +observability_monitoring_on_compute_nodes_enable: true +observability_logs_enable_for_management: true +observability_logs_enable_for_compute: true +observability_enable_platform_logs: true +observability_enable_metrics_routing: true +observability_logs_retention_period: 7 +observability_monitoring_plan: graduated-tier +sccwp_enable: true +cspm_enabled: true +sccwp_service_plan: graduated-tier +app_config_plan: standardv2 +enable_hyperthreading: true +enable_ldap: true +ldap_basedns: cicdldap.com +ldap_admin_password: Pass@123 # pragma: allowlist secret +ldap_user_name: tester +ldap_user_password: Pass@123 # pragma: allowlist secret +ldap_instance: + - profile: cx2-2x4 + image: ibm-ubuntu-22-04-5-minimal-amd64-1 + count: 1 +us_east_zone: us-east-3 +us_east_cluster_name: HPC-LSF-1 +eu_de_zone: eu-de-3 +eu_de_cluster_name: HPC-LSF-2 +us_south_zone: us-south-1 +us_south_cluster_name: HPC-LSF-2 +jp_tok_zone: 
jp-tok-1 +jp_tok_cluster_name: HPC-LSF-2 +attracker_test_zone: eu-de-1 #added for testing purpose +management_instances_image: hpc-lsf-fp15-rhel810-v1 #added for testing purpose +static_compute_instances_image: hpc-lsf-fp15-compute-rhel810-v1 #added for testing purpose +dynamic_compute_instances_image: hpc-lsf-fp15-compute-rhel810-v1 #added for testing purpose +ssh_file_path: /artifacts/.ssh/id_rsa +ssh_file_path_two: /artifacts/.ssh/id_rsa diff --git a/tests/deployment/lsf_deployment.go b/tests/deployment/lsf_deployment.go new file mode 100644 index 00000000..046ae0de --- /dev/null +++ b/tests/deployment/lsf_deployment.go @@ -0,0 +1,349 @@ +package tests + +import ( + "encoding/json" + "fmt" + "log" + "os" + "strconv" + "strings" + + utils "github.com/terraform-ibm-modules/terraform-ibm-hpc/utilities" + "gopkg.in/yaml.v3" +) + +// globalIP stores the public IP address +var globalIP string + +// ManagementNodeInstances represents each management node instance. +type ManagementNodeInstances struct { + Profile string `yaml:"profile" json:"profile"` + Count int `yaml:"count" json:"count"` + Image string `yaml:"image" json:"image"` +} + +// LoginNodeInstance represents login node instance. +type LoginNodeInstance struct { + Profile string `yaml:"profile" json:"profile"` + Image string `yaml:"image" json:"image"` +} + +// BastionInstance represents bastion node instance. +type BastionInstance struct { + Profile string `yaml:"profile" json:"profile"` + Image string `yaml:"image" json:"image"` +} + +// DeployerInstance represents deployer node instance. +type DeployerInstance struct { + Profile string `yaml:"profile" json:"profile"` + Image string `yaml:"image" json:"image"` +} + +// StaticWorkerInstances represents each static compute instance. +type StaticWorkerInstances struct { + Profile string `yaml:"profile" json:"profile"` + Count int `yaml:"count" json:"count"` + Image string `yaml:"image" json:"image"` +} + +// DynamicWorkerInstances represents each dynamic compute instance. +type DynamicWorkerInstances struct { + Profile string `yaml:"profile" json:"profile"` + Count int `yaml:"count" json:"count"` + Image string `yaml:"image" json:"image"` +} + +// LDAPServerNodeInstance represents each ldap node instance. +type LDAPServerNodeInstance struct { + Profile string `yaml:"profile" json:"profile"` + Image string `yaml:"image" json:"image"` + Count int `yaml:"count" json:"count"` +} + +// CustomFileShare represents custom file share configuration. +type CustomFileShare struct { + MountPath string `yaml:"mount_path" json:"mount_path"` + Size string `yaml:"size" json:"size"` + IOPS string `yaml:"iops" json:"iops"` +} + +// DnsDomainNames represents DNS configuration. +type DnsDomainName struct { + Compute string `yaml:"compute" json:"compute"` +} + +// Config represents the YAML configuration. 
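// Field names map one-to-one (via the yaml struct tags) to the keys in
// tests/data/lsf_fp14_config.yml and tests/data/lsf_fp15_config.yml;
// GetConfigFromYAML decodes one of those files into this struct and
// setEnvFromConfig then exports each value as an environment variable
// (slice- and struct-valued fields are JSON-encoded first).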
+type Config struct { + BastionInstance BastionInstance `yaml:"bastion_instance"` + Scheduler string `yaml:"scheduler"` + DefaultExistingResourceGroup string `yaml:"default_existing_resource_group"` + NonDefaultExistingResourceGroup string `yaml:"non_default_existing_resource_group"` + Zones string `yaml:"zones"` + ClusterName string `yaml:"cluster_name"` + RemoteAllowedIPs string `yaml:"remote_allowed_ips"` + SSHKeys string `yaml:"ssh_keys"` + DeployerInstance DeployerInstance `yaml:"deployer_instance"` + EnableVPCFlowLogs bool `yaml:"enable_vpc_flow_logs"` + KeyManagement string `yaml:"key_management"` + KMSInstanceName string `yaml:"kms_instance_name"` + KMSKeyName string `yaml:"kms_key_name"` + EnableHyperthreading bool `yaml:"enable_hyperthreading"` + DnsDomainName DnsDomainName `yaml:"dns_domain_name"` + EnableLdap bool `yaml:"enable_ldap"` + LdapBaseDns string `yaml:"ldap_basedns"` + LdapAdminPassword string `yaml:"ldap_admin_password"` // pragma: allowlist secret + LdapUserName string `yaml:"ldap_user_name"` + LdapUserPassword string `yaml:"ldap_user_password"` // pragma: allowlist secret + LdapInstance []LDAPServerNodeInstance `yaml:"ldap_instance"` + USEastZone string `yaml:"us_east_zone"` + USEastClusterName string `yaml:"us_east_cluster_name"` + JPTokZone string `yaml:"jp_tok_zone"` + JPTokClusterName string `yaml:"jp_tok_cluster_name"` + EUDEZone string `yaml:"eu_de_zone"` + EUDEClusterName string `yaml:"eu_de_cluster_name"` + USSouthZone string `yaml:"us_south_zone"` + USSouthClusterName string `yaml:"us_south_cluster_name"` + SSHFilePath string `yaml:"ssh_file_path"` + SSHFilePathTwo string `yaml:"ssh_file_path_two"` + StaticComputeInstances []StaticWorkerInstances `yaml:"static_compute_instances"` + DynamicComputeInstances []DynamicWorkerInstances `yaml:"dynamic_compute_instances"` + SccWPEnabled bool `yaml:"sccwp_enable"` + CspmEnabled bool `yaml:"cspm_enabled"` + SccwpServicePlan string `yaml:"sccwp_service_plan"` + AppConfigPlan string `yaml:"app_config_plan"` + ObservabilityMonitoringEnable bool `yaml:"observability_monitoring_enable"` + ObservabilityMonitoringOnComputeNodesEnable bool `yaml:"observability_monitoring_on_compute_nodes_enable"` + ObservabilityAtrackerEnable bool `yaml:"observability_atracker_enable"` + ObservabilityAtrackerTargetType string `yaml:"observability_atracker_target_type"` + ObservabilityLogsEnableForManagement bool `yaml:"observability_logs_enable_for_management"` + ObservabilityLogsEnableForCompute bool `yaml:"observability_logs_enable_for_compute"` + ObservabilityEnablePlatformLogs bool `yaml:"observability_enable_platform_logs"` + ObservabilityEnableMetricsRouting bool `yaml:"observability_enable_metrics_routing"` + ObservabilityLogsRetentionPeriod int `yaml:"observability_logs_retention_period"` + ObservabilityMonitoringPlan string `yaml:"observability_monitoring_plan"` + EnableCosIntegration bool `yaml:"enable_cos_integration"` + CustomFileShares []CustomFileShare `yaml:"custom_file_shares"` + PlacementStrategy string `yaml:"placement_strategy"` + ManagementInstances []ManagementNodeInstances `yaml:"management_instances"` + ManagementInstancesImage string `yaml:"management_instances_image"` + StaticComputeInstancesImage string `yaml:"static_compute_instances_image"` + DynamicComputeInstancesImage string `yaml:"dynamic_compute_instances_image"` + AppCenterGuiPassword string `yaml:"app_center_gui_password"` // pragma: allowlist secret + LsfVersion string `yaml:"lsf_version"` + LoginInstance []LoginNodeInstance `yaml:"login_instance"` + 
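	// AttrackerTestZone maps to the attracker_test_zone key that the YAML
	// configs mark as "added for testing purpose"; it is intended to supply
	// the zone used by the Activity Tracker test scenario.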
AttrackerTestZone string `yaml:"attracker_test_zone"` +} + +// GetConfigFromYAML reads a YAML file and populates the Config struct. +func GetConfigFromYAML(filePath string) (*Config, error) { + file, err := os.Open(filePath) + if err != nil { + return nil, fmt.Errorf("failed to open YAML file %s: %w", filePath, err) + } + defer func() { + if closeErr := file.Close(); closeErr != nil { + log.Printf("Warning: failed to close file %s: %v", filePath, closeErr) + } + }() + + var config Config + if err := yaml.NewDecoder(file).Decode(&config); err != nil { + return nil, fmt.Errorf("failed to decode YAML from %s: %w", filePath, err) + } + + // Get the public IP + globalIP, err = utils.GetPublicIP() + if err != nil { + return nil, fmt.Errorf("failed to get public IP: %w", err) + } + + if err := setEnvFromConfig(&config); err != nil { + return nil, fmt.Errorf("failed to set environment variables: %w", err) + } + + return &config, nil +} + +// setEnvFromConfig sets environment variables based on the provided configuration. +func setEnvFromConfig(config *Config) error { + envVars := map[string]interface{}{ + "BASTION_INSTANCE": config.BastionInstance, + "DEFAULT_EXISTING_RESOURCE_GROUP": config.DefaultExistingResourceGroup, + "NON_DEFAULT_EXISTING_RESOURCE_GROUP": config.NonDefaultExistingResourceGroup, + "ZONES": config.Zones, + "CLUSTER_NAME": config.ClusterName, + "REMOTE_ALLOWED_IPS": config.RemoteAllowedIPs, + "SSH_KEYS": config.SSHKeys, + "DEPLOYER_INSTANCE": config.DeployerInstance, + "ENABLE_VPC_FLOW_LOGS": config.EnableVPCFlowLogs, + "KEY_MANAGEMENT": config.KeyManagement, + "KMS_INSTANCE_NAME": config.KMSInstanceName, + "KMS_KEY_NAME": config.KMSKeyName, + "ENABLE_HYPERTHREADING": config.EnableHyperthreading, + "DNS_DOMAIN_NAME": config.DnsDomainName, + "ENABLE_LDAP": config.EnableLdap, + "LDAP_BASEDNS": config.LdapBaseDns, + "LDAP_ADMIN_PASSWORD": config.LdapAdminPassword, // pragma: allowlist secret + "LDAP_USER_NAME": config.LdapUserName, + "LDAP_USER_PASSWORD": config.LdapUserPassword, // pragma: allowlist secret + "LDAP_INSTANCE": config.LdapInstance, + "US_EAST_ZONE": config.USEastZone, + "US_EAST_CLUSTER_NAME": config.USEastClusterName, + "EU_DE_ZONE": config.EUDEZone, + "EU_DE_CLUSTER_NAME": config.EUDEClusterName, + "US_SOUTH_ZONE": config.USSouthZone, + "US_SOUTH_CLUSTER_NAME": config.USSouthClusterName, + "JP_TOK_ZONE": config.JPTokZone, + "JP_TOK_CLUSTER_NAME": config.JPTokClusterName, + "SSH_FILE_PATH": config.SSHFilePath, + "SSH_FILE_PATH_TWO": config.SSHFilePathTwo, + "SCHEDULER": config.Scheduler, + "SCCWP_ENABLED": config.SccWPEnabled, + "CSPM_ENABLED": config.CspmEnabled, + "SCCWP_SERVICE_PLAN": config.SccwpServicePlan, + "APP_CONFIG_PLAN": config.AppConfigPlan, + "OBSERVABILITY_MONITORING_ENABLE": config.ObservabilityMonitoringEnable, + "OBSERVABILITY_MONITORING_ON_COMPUTE_NODES_ENABLE": config.ObservabilityMonitoringOnComputeNodesEnable, + "OBSERVABILITY_ATRACKER_ENABLE": config.ObservabilityAtrackerEnable, + "OBSERVABILITY_ATRACKER_TARGET_TYPE": config.ObservabilityAtrackerTargetType, + "OBSERVABILITY_LOGS_ENABLE_FOR_MANAGEMENT": config.ObservabilityLogsEnableForManagement, + "OBSERVABILITY_LOGS_ENABLE_FOR_COMPUTE": config.ObservabilityLogsEnableForCompute, + "OBSERVABILITY_ENABLE_PLATFORM_LOGS": config.ObservabilityEnablePlatformLogs, + "OBSERVABILITY_ENABLE_METRICS_ROUTING": config.ObservabilityEnableMetricsRouting, + "OBSERVABILITY_LOGS_RETENTION_PERIOD": config.ObservabilityLogsRetentionPeriod, + "OBSERVABILITY_MONITORING_PLAN": config.ObservabilityMonitoringPlan, + 
"ENABLE_COS_INTEGRATION": config.EnableCosIntegration, + "CUSTOM_FILE_SHARES": config.CustomFileShares, + "PLACEMENT_STRATEGY": config.PlacementStrategy, + "MANAGEMENT_INSTANCES": config.ManagementInstances, + "MANAGEMENT_INSTANCES_IMAGE": config.ManagementInstancesImage, + "STATIC_COMPUTE_INSTANCES_IMAGE": config.StaticComputeInstancesImage, + "DYNAMIC_COMPUTE_INSTANCES_IMAGE": config.DynamicComputeInstancesImage, + "APP_CENTER_GUI_PASSWORD": config.AppCenterGuiPassword, // pragma: allowlist secret + "LSF_VERSION": config.LsfVersion, + "LOGIN_INSTANCE": config.LoginInstance, + "ATTRACKER_TEST_ZONE": config.AttrackerTestZone, + } + + if err := processSliceConfigs(config, envVars); err != nil { + return fmt.Errorf("error processing slice configurations: %w", err) + } + + for key, value := range envVars { + if err := setEnvironmentVariable(key, value); err != nil { + return fmt.Errorf("failed to set environment variable %s: %w", key, err) + } + } + + return nil +} + +// processSliceConfigs handles the JSON marshaling of slice configurations +func processSliceConfigs(config *Config, envVars map[string]interface{}) error { + sliceProcessors := []struct { + name string + instances interface{} + }{ + {"STATIC_COMPUTE_INSTANCES", config.StaticComputeInstances}, + {"DYNAMIC_COMPUTE_INSTANCES", config.DynamicComputeInstances}, + {"MANAGEMENT_INSTANCES", config.ManagementInstances}, + {"LOGIN_INSTANCE", config.LoginInstance}, + {"CUSTOM_FILE_SHARES", config.CustomFileShares}, + {"LDAP_INSTANCE", config.LdapInstance}, + } + + for _, processor := range sliceProcessors { + if processor.name == "CUSTOM_FILE_SHARES" { + if err := checkFileShares(config.CustomFileShares); err != nil { + return err + } + } + if err := marshalToEnv(processor.name, processor.instances, envVars); err != nil { + return err + } + } + + return nil +} + +// checkFileShares validates file shares configuration +func checkFileShares(fileShares []CustomFileShare) error { + for _, share := range fileShares { + if share.MountPath == "" { + log.Printf("Warning: FileShares MountPath is empty in configuration") + } + } + return nil +} + +// marshalToEnv marshals data to JSON and stores it in envVars map +func marshalToEnv(key string, data interface{}, envVars map[string]interface{}) error { + jsonBytes, err := json.Marshal(data) + if err != nil { + return fmt.Errorf("failed to marshal %s: %w", key, err) + } + envVars[key] = string(jsonBytes) + return nil +} + +// setEnvironmentVariable sets a single environment variable with proper type handling +func setEnvironmentVariable(key string, value interface{}) error { + if value == nil { + return nil + } + + if existing := os.Getenv(key); existing != "" { + log.Printf("Environment variable %s is already set. Skipping overwrite.", key) + return nil + } + + if key == "REMOTE_ALLOWED_IPS" { + return handleRemoteAllowedIPs(value) + } + + switch v := value.(type) { + case string: + if v != "" { + return os.Setenv(key, v) + } + case bool: + return os.Setenv(key, strconv.FormatBool(v)) + case int: + return os.Setenv(key, strconv.Itoa(v)) + case float64: + return os.Setenv(key, strconv.FormatFloat(v, 'f', -1, 64)) + case []string: + if len(v) > 0 { + return os.Setenv(key, strings.Join(v, ",")) + } + default: + jsonBytes, err := json.Marshal(value) + if err != nil { + return fmt.Errorf("failed to marshal %s: %w", key, err) + } + return os.Setenv(key, string(jsonBytes)) + } + + return nil +} + +// handleARemoteAllowedIPs handles special case for the remote_allowed_ips environment variable. 
+func handleRemoteAllowedIPs(value interface{}) error { + // Assert value is of type string + cidr, ok := value.(string) + if !ok { + return fmt.Errorf("remote_allowed_ips must be a string") + } + + // Handle default/empty CIDR + if cidr == "" || cidr == "0.0.0.0/0" { + if globalIP == "" { + return fmt.Errorf("globalIP is empty, cannot set REMOTE_ALLOWED_IPS") + } + return os.Setenv("REMOTE_ALLOWED_IPS", globalIP+"/32") + } + + // Set environment variable + return os.Setenv("REMOTE_ALLOWED_IPS", cidr) +} diff --git a/tests/go.mod b/tests/go.mod index 8c4147ab..eb935b78 100644 --- a/tests/go.mod +++ b/tests/go.mod @@ -1,28 +1,28 @@ module github.com/terraform-ibm-modules/terraform-ibm-hpc -go 1.23.4 +go 1.24.2 -toolchain go1.24.0 +toolchain go1.24.3 require ( - github.com/IBM/go-sdk-core/v5 v5.19.0 - github.com/IBM/secrets-manager-go-sdk/v2 v2.0.10 - github.com/gruntwork-io/terratest v0.48.2 + github.com/IBM/go-sdk-core/v5 v5.20.0 + github.com/IBM/secrets-manager-go-sdk/v2 v2.0.11 + github.com/gruntwork-io/terratest v0.49.0 github.com/stretchr/testify v1.10.0 - github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper v1.48.2 - golang.org/x/crypto v0.37.0 + github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper v1.50.1 + golang.org/x/crypto v0.39.0 gopkg.in/yaml.v3 v3.0.1 ) require ( - dario.cat/mergo v1.0.1 // indirect + dario.cat/mergo v1.0.0 // indirect github.com/IBM-Cloud/bluemix-go v0.0.0-20240719075425-078fcb3a55be // indirect github.com/IBM-Cloud/power-go-client v1.11.0 // indirect github.com/IBM/cloud-databases-go-sdk v0.7.1 // indirect - github.com/IBM/platform-services-go-sdk v0.79.0 // indirect + github.com/IBM/platform-services-go-sdk v0.81.1 // indirect github.com/IBM/project-go-sdk v0.3.6 // indirect github.com/IBM/schematics-go-sdk v0.4.0 // indirect - github.com/IBM/vpc-go-sdk v0.67.0 // indirect + github.com/IBM/vpc-go-sdk v0.68.0 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/ProtonMail/go-crypto v1.1.6 // indirect github.com/agext/levenshtein v1.2.3 // indirect @@ -37,19 +37,19 @@ require ( github.com/ghodss/yaml v1.0.0 // indirect github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect github.com/go-git/go-billy/v5 v5.6.2 // indirect - github.com/go-git/go-git/v5 v5.14.0 // indirect + github.com/go-git/go-git/v5 v5.16.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/go-openapi/analysis v0.23.0 // indirect + github.com/go-openapi/analysis v0.21.5 // indirect github.com/go-openapi/errors v0.22.1 // indirect - github.com/go-openapi/jsonpointer v0.21.1 // indirect - github.com/go-openapi/jsonreference v0.21.0 // indirect - github.com/go-openapi/loads v0.22.0 // indirect - github.com/go-openapi/runtime v0.28.0 // indirect - github.com/go-openapi/spec v0.21.0 // indirect + github.com/go-openapi/jsonpointer v0.20.1 // indirect + github.com/go-openapi/jsonreference v0.20.3 // indirect + github.com/go-openapi/loads v0.21.3 // indirect + github.com/go-openapi/runtime v0.26.0 // indirect + github.com/go-openapi/spec v0.20.12 // indirect github.com/go-openapi/strfmt v0.23.0 // indirect - github.com/go-openapi/swag v0.23.1 // indirect - github.com/go-openapi/validate v0.24.0 // indirect + github.com/go-openapi/swag v0.22.5 // indirect + github.com/go-openapi/validate v0.22.4 // indirect github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect github.com/go-playground/validator/v10 v10.26.0 // indirect @@ -63,16 
+63,16 @@ require ( github.com/hashicorp/go-retryablehttp v0.7.7 // indirect github.com/hashicorp/go-safetemp v1.0.0 // indirect github.com/hashicorp/go-version v1.7.0 // indirect - github.com/hashicorp/hcl/v2 v2.23.0 // indirect - github.com/hashicorp/terraform-json v0.24.0 // indirect + github.com/hashicorp/hcl/v2 v2.22.0 // indirect + github.com/hashicorp/terraform-json v0.25.0 // indirect github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect github.com/jinzhu/copier v0.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/kevinburke/ssh_config v1.2.0 // indirect - github.com/klauspost/compress v1.18.0 // indirect + github.com/klauspost/compress v1.16.7 // indirect github.com/leodido/go-urn v1.4.0 // indirect - github.com/mailru/easyjson v0.9.0 // indirect - github.com/mattn/go-zglob v0.0.6 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/mattn/go-zglob v0.0.4 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/mitchellh/go-testing-interface v1.14.1 // indirect github.com/mitchellh/go-wordwrap v1.0.1 // indirect @@ -83,22 +83,21 @@ require ( github.com/pmezard/go-difflib v1.0.0 // indirect github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect github.com/skeema/knownhosts v1.3.1 // indirect - github.com/tmccombs/hcl2json v0.6.7 // indirect - github.com/ulikunitz/xz v0.5.12 // indirect + github.com/tmccombs/hcl2json v0.6.4 // indirect + github.com/ulikunitz/xz v0.5.11 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect github.com/zclconf/go-cty v1.16.2 // indirect go.mongodb.org/mongo-driver v1.17.3 // indirect - go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect - go.opentelemetry.io/otel/metric v1.35.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect - golang.org/x/mod v0.24.0 // indirect - golang.org/x/net v0.39.0 // indirect - golang.org/x/sync v0.13.0 // indirect - golang.org/x/sys v0.32.0 // indirect - golang.org/x/text v0.24.0 // indirect - golang.org/x/tools v0.32.0 // indirect - google.golang.org/protobuf v1.36.1 // indirect + go.opentelemetry.io/otel v1.29.0 // indirect + go.opentelemetry.io/otel/metric v1.29.0 // indirect + go.opentelemetry.io/otel/trace v1.29.0 // indirect + golang.org/x/mod v0.25.0 // indirect + golang.org/x/net v0.40.0 // indirect + golang.org/x/sync v0.15.0 // indirect + golang.org/x/sys v0.33.0 // indirect + golang.org/x/text v0.26.0 // indirect + golang.org/x/tools v0.33.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect + sigs.k8s.io/yaml v1.4.0 // indirect ) diff --git a/tests/go.sum b/tests/go.sum index 93a6e0f3..80416288 100644 --- a/tests/go.sum +++ b/tests/go.sum @@ -1,23 +1,23 @@ -dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= -dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= +dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= +dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= github.com/IBM-Cloud/bluemix-go v0.0.0-20240719075425-078fcb3a55be h1:USOcBHkYQ4o/ccoEvoHinrba8NQthLJpFXnAoBY+MI4= github.com/IBM-Cloud/bluemix-go v0.0.0-20240719075425-078fcb3a55be/go.mod h1:/7hMjdZA6fEpd/dQAOEABxKEwN0t72P3PlpEDu0Y7bE= github.com/IBM-Cloud/power-go-client v1.11.0 h1:4xlYXF2+S3s6Crb0D2+d5c1kb6gUE7eowMXLB7Q6cWY= github.com/IBM-Cloud/power-go-client v1.11.0/go.mod h1:UDyXeIKEp6r7yWUXYu3r0ZnFSlNZ2YeQTHwM2Tmlgv0= github.com/IBM/cloud-databases-go-sdk 
v0.7.1 h1:5kK4/3NUsGxZzmuUe+1ftajpOQbeDVh5VeemrPgROP4= github.com/IBM/cloud-databases-go-sdk v0.7.1/go.mod h1:JYucI1PdwqbAd8XGdDAchxzxRP7bxOh1zUnseovHKsc= -github.com/IBM/go-sdk-core/v5 v5.19.0 h1:YN2S5JUvq/EwYulmcNFwgyYBxZhVWl9nkY22H7Hpghw= -github.com/IBM/go-sdk-core/v5 v5.19.0/go.mod h1:deZO1J5TSlU69bCnl/YV7nPxFZA2UEaup7cq/7ZTOgw= -github.com/IBM/platform-services-go-sdk v0.79.0 h1:qCNheB3390holPcpDxdgNyi11JS6ZfsL39YgnJEOsTo= -github.com/IBM/platform-services-go-sdk v0.79.0/go.mod h1:FzCPOfbNAt0s9RwtIrbJbfDwA7mKIObtZ/18KnviKr0= +github.com/IBM/go-sdk-core/v5 v5.20.0 h1:rG1fn5GmJfFzVtpDKndsk6MgcarluG8YIWf89rVqLP8= +github.com/IBM/go-sdk-core/v5 v5.20.0/go.mod h1:Q3BYO6iDA2zweQPDGbNTtqft5tDcEpm6RTuqMlPcvbw= +github.com/IBM/platform-services-go-sdk v0.81.1 h1:Ch9wUIigyA3HzW7MQnA1WTHAw+QA6W4bSP3ThgzDpx0= +github.com/IBM/platform-services-go-sdk v0.81.1/go.mod h1:XOowH+JnIih3FA7uilLVM/9VH7XgCmJ4T/i6eZi7gkw= github.com/IBM/project-go-sdk v0.3.6 h1:DRiANKnAePevFsIKSvR89SUaMa2xsd7YKK71Ka1eqKI= github.com/IBM/project-go-sdk v0.3.6/go.mod h1:FOJM9ihQV3EEAY6YigcWiTNfVCThtdY8bLC/nhQHFvo= github.com/IBM/schematics-go-sdk v0.4.0 h1:x01f/tPquYJYLQzJLGuxWfCbV/EdSMXRikOceNy/JLM= github.com/IBM/schematics-go-sdk v0.4.0/go.mod h1:Xe7R7xgwmXBHu09w2CbBe8lkWZaYxNQo19bS4dpLrUA= -github.com/IBM/secrets-manager-go-sdk/v2 v2.0.10 h1:R9ZMCCi7yJnDIe88+UKKQf0CFBB74E6k8mOp+++kL4w= -github.com/IBM/secrets-manager-go-sdk/v2 v2.0.10/go.mod h1:Bmy0woaAxxNPVHCqusarnTZVyVMnLRVwemF6gvGHcLo= -github.com/IBM/vpc-go-sdk v0.67.0 h1:p8G5bqTUyVheBrJpT+pLpoZoA/Yu1R2xX4xJLM4tT9w= -github.com/IBM/vpc-go-sdk v0.67.0/go.mod h1:VL7sy61ybg6tvA60SepoQx7TFe20m7JyNUt+se2tHP4= +github.com/IBM/secrets-manager-go-sdk/v2 v2.0.11 h1:RG/hnKvKSMrG3X5Jm/P/itg+y/FGPY7+B5N3XYQDbmQ= +github.com/IBM/secrets-manager-go-sdk/v2 v2.0.11/go.mod h1:7r0LOxg+K/y2fVbh2Uopu5r+VE76p1VTk/3gHAs5MQk= +github.com/IBM/vpc-go-sdk v0.68.0 h1:Zs65PWeWBG5IwafAJV0RdPVsi3hCjIkhFZkqr1sLt5g= +github.com/IBM/vpc-go-sdk v0.68.0/go.mod h1:VL7sy61ybg6tvA60SepoQx7TFe20m7JyNUt+se2tHP4= github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= @@ -70,8 +70,8 @@ github.com/go-git/go-billy/v5 v5.6.2 h1:6Q86EsPXMa7c3YZ3aLAQsMA0VlWmy43r6FHqa/UN github.com/go-git/go-billy/v5 v5.6.2/go.mod h1:rcFC2rAsp/erv7CMz9GczHcuD0D32fWzH+MJAU+jaUU= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= -github.com/go-git/go-git/v5 v5.14.0 h1:/MD3lCrGjCen5WfEAzKg00MJJffKhC8gzS80ycmCi60= -github.com/go-git/go-git/v5 v5.14.0/go.mod h1:Z5Xhoia5PcWA3NF8vRLURn9E5FRhSl7dGj9ItW3Wk5k= +github.com/go-git/go-git/v5 v5.16.0 h1:k3kuOEpkc0DeY7xlL6NaaNg39xdgQbtH5mwCafHO9AQ= +github.com/go-git/go-git/v5 v5.16.0/go.mod h1:4Ge4alE/5gPs30F2H1esi2gPd69R0C39lolkucHBOp8= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= @@ -80,28 +80,28 @@ github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/go-openapi/analysis v0.23.0 h1:aGday7OWupfMs+LbmLZG4k0MYXIANxcuBTYUC03zFCU= -github.com/go-openapi/analysis v0.23.0/go.mod h1:9mz9ZWaSlV8TvjQHLl2mUW2PbZtemkE8yA5v22ohupo= +github.com/go-openapi/analysis v0.21.5 h1:3tHfEBh6Ia8eKc4M7khOGjPOAlWKJ10d877Cr9teujI= +github.com/go-openapi/analysis v0.21.5/go.mod h1:25YcZosX9Lwz2wBsrFrrsL8bmjjXdlyP6zsr2AMy29M= github.com/go-openapi/errors v0.20.3/go.mod h1:Z3FlZ4I8jEGxjUK+bugx3on2mIAk4txuAOhlsB1FSgk= github.com/go-openapi/errors v0.22.1 h1:kslMRRnK7NCb/CvR1q1VWuEQCEIsBGn5GgKD9e+HYhU= github.com/go-openapi/errors v0.22.1/go.mod h1:+n/5UdIqdVnLIJ6Q9Se8HNGUXYaY6CN8ImWzfi/Gzp0= -github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic= -github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= -github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= -github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= -github.com/go-openapi/loads v0.22.0 h1:ECPGd4jX1U6NApCGG1We+uEozOAvXvJSF4nnwHZ8Aco= -github.com/go-openapi/loads v0.22.0/go.mod h1:yLsaTCS92mnSAZX5WWoxszLj0u+Ojl+Zs5Stn1oF+rs= -github.com/go-openapi/runtime v0.28.0 h1:gpPPmWSNGo214l6n8hzdXYhPuJcGtziTOgUpvsFWGIQ= -github.com/go-openapi/runtime v0.28.0/go.mod h1:QN7OzcS+XuYmkQLw05akXk0jRH/eZ3kb18+1KwW9gyc= -github.com/go-openapi/spec v0.21.0 h1:LTVzPc3p/RzRnkQqLRndbAzjY0d0BCL72A6j3CdL9ZY= -github.com/go-openapi/spec v0.21.0/go.mod h1:78u6VdPw81XU44qEWGhtr982gJ5BWg2c0I5XwVMotYk= +github.com/go-openapi/jsonpointer v0.20.1 h1:MkK4VEIEZMj4wT9PmjaUmGflVBr9nvud4Q4UVFbDoBE= +github.com/go-openapi/jsonpointer v0.20.1/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs= +github.com/go-openapi/jsonreference v0.20.3 h1:EjGcjTW8pD1mRis6+w/gmoBdqv5+RbE9B85D1NgDOVQ= +github.com/go-openapi/jsonreference v0.20.3/go.mod h1:FviDZ46i9ivh810gqzFLl5NttD5q3tSlMLqLr6okedM= +github.com/go-openapi/loads v0.21.3 h1:8sSH2FIm/SnbDUGv572md4YqVMFne/a9Eubvcd3anew= +github.com/go-openapi/loads v0.21.3/go.mod h1:Y3aMR24iHbKHppOj91nQ/SHc0cuPbAr4ndY4a02xydc= +github.com/go-openapi/runtime v0.26.0 h1:HYOFtG00FM1UvqrcxbEJg/SwvDRvYLQKGhw2zaQjTcc= +github.com/go-openapi/runtime v0.26.0/go.mod h1:QgRGeZwrUcSHdeh4Ka9Glvo0ug1LC5WyE+EV88plZrQ= +github.com/go-openapi/spec v0.20.12 h1:cgSLbrsmziAP2iais+Vz7kSazwZ8rsUZd6TUzdDgkVI= +github.com/go-openapi/spec v0.20.12/go.mod h1:iSCgnBcwbMW9SfzJb8iYynXvcY6C/QFrI7otzF7xGM4= github.com/go-openapi/strfmt v0.21.7/go.mod h1:adeGTkxE44sPyLk0JV235VQAO/ZXUr8KAzYjclFs3ew= github.com/go-openapi/strfmt v0.23.0 h1:nlUS6BCqcnAk0pyhi9Y+kdDVZdZMHfEKQiS4HaMgO/c= github.com/go-openapi/strfmt v0.23.0/go.mod h1:NrtIpfKtWIygRkKVsxh7XQMDQW5HKQl6S5ik2elW+K4= -github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= -github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= -github.com/go-openapi/validate v0.24.0 h1:LdfDKwNbpB6Vn40xhTdNZAnfLECL81w+VX3BumrGD58= -github.com/go-openapi/validate v0.24.0/go.mod h1:iyeX1sEufmv3nPbBdX3ieNviWnOZaJ1+zquzJEf2BAQ= +github.com/go-openapi/swag v0.22.5 h1:fVS63IE3M0lsuWRzuom3RLwUMVI2peDH01s6M70ugys= +github.com/go-openapi/swag v0.22.5/go.mod h1:Gl91UqO+btAM0plGGxHqJcQZ1ZTy6jbmridBTsDy8A0= +github.com/go-openapi/validate v0.22.4 h1:5v3jmMyIPKTR8Lv9syBAIRxG6lY0RqeBPB1LKEijzk8= +github.com/go-openapi/validate 
v0.22.4/go.mod h1:qm6O8ZIcPVdSY5219468Jv7kBdGvkiZLPOmqnqTUZ2A= github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= @@ -141,8 +141,8 @@ github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLe github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gruntwork-io/terratest v0.48.2 h1:+VwfODchq8jxZZWD+s8gBlhD1z6/C4bFLNrhpm9ONrs= -github.com/gruntwork-io/terratest v0.48.2/go.mod h1:Y5ETyD4ZQ2MZhasPno272fWuCpKwvTPYDi8Y0tIMqTE= +github.com/gruntwork-io/terratest v0.49.0 h1:GurfpHEOEr8vntB77QcxDh+P7aiQRUgPFdgb6q9PuWI= +github.com/gruntwork-io/terratest v0.49.0/go.mod h1:/+dfGio9NqUpvvukuPo29B8zy6U5FYJn9PdmvwztK4A= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -160,10 +160,10 @@ github.com/hashicorp/go-safetemp v1.0.0 h1:2HR189eFNrjHQyENnQMMpCiBAsRxzbTMIgBhE github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I= github.com/hashicorp/go-version v1.7.0 h1:5tqGy27NaOTB8yJKUZELlFAS/LTKJkrmONwQKeRZfjY= github.com/hashicorp/go-version v1.7.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= -github.com/hashicorp/hcl/v2 v2.23.0 h1:Fphj1/gCylPxHutVSEOf2fBOh1VE4AuLV7+kbJf3qos= -github.com/hashicorp/hcl/v2 v2.23.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= -github.com/hashicorp/terraform-json v0.24.0 h1:rUiyF+x1kYawXeRth6fKFm/MdfBS6+lW4NbeATsYz8Q= -github.com/hashicorp/terraform-json v0.24.0/go.mod h1:Nfj5ubo9xbu9uiAoZVBsNOjvNKB66Oyrvtit74kC7ow= +github.com/hashicorp/hcl/v2 v2.22.0 h1:hkZ3nCtqeJsDhPRFz5EA9iwcG1hNWGePOTw6oyul12M= +github.com/hashicorp/hcl/v2 v2.22.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= +github.com/hashicorp/terraform-json v0.25.0 h1:rmNqc/CIfcWawGiwXmRuiXJKEiJu1ntGoxseG1hLhoQ= +github.com/hashicorp/terraform-json v0.25.0/go.mod h1:sMKS8fiRDX4rVlR6EJUMudg1WcanxCMoWwTLkgZP/vc= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A= @@ -175,8 +175,8 @@ github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFF github.com/kevinburke/ssh_config v1.2.0 h1:x584FjTGwHzMwvHx18PXxbBVzfnxogHaAReU4gf13a4= github.com/kevinburke/ssh_config v1.2.0/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM= github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= -github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= github.com/kr/pretty v0.1.0/go.mod 
h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -186,14 +186,14 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= -github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= -github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mattn/go-zglob v0.0.6 h1:mP8RnmCgho4oaUYDIDn6GNxYk+qJGUs8fJLn+twYj2A= -github.com/mattn/go-zglob v0.0.6/go.mod h1:MxxjyoXXnMxfIpxTK2GAkw1w8glPsQILx3N5wrKakiY= +github.com/mattn/go-zglob v0.0.4 h1:LQi2iOm0/fGgu80AioIJ/1j9w9Oh+9DZ39J4VAGzHQM= +github.com/mattn/go-zglob v0.0.4/go.mod h1:MxxjyoXXnMxfIpxTK2GAkw1w8glPsQILx3N5wrKakiY= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-testing-interface v1.14.1 h1:jrgshOhYAUVNMAJiKbEu7EqAwgJJ2JqpQmpLJOu07cU= @@ -250,8 +250,8 @@ github.com/onsi/gomega v1.27.8/go.mod h1:2J8vzI/s+2shY9XHRApDkdgPo1TKT7P2u6fXeJK github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M= github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/onsi/gomega v1.31.1/go.mod h1:y40C95dwAD1Nz36SsEnxvfFe8FFfNxzI5eJ0EYGyAy0= -github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8= -github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY= +github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= +github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= github.com/pjbgf/sha1cd v0.3.2 h1:a9wb0bp1oC2TGwStyn0Umc/IGKQnEgF0vVaZ8QF8eo4= @@ -282,13 +282,13 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper v1.48.2 h1:pc2mBIfQCflHkxTunSAvAcSvM7uhDp6oMZ3L6hBj90w= -github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper v1.48.2/go.mod h1:6bjkgzKQBYsIX7+tSQjB4C1NEq3qQBKJ/0LD8OGZffg= +github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper v1.50.1 
h1:5t2x8tkTeEeLrVy141bLVTWfd8zC9pvidByXJxUH6k8= +github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper v1.50.1/go.mod h1:DPxpxzMr8GCuuUzNlNWdAFAHfHRv1mETuEs2G47+7+M= github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= -github.com/tmccombs/hcl2json v0.6.7 h1:RYKTs4kd/gzRsEiv7J3M2WQ7TYRYZVc+0H0pZdERkxA= -github.com/tmccombs/hcl2json v0.6.7/go.mod h1:lJgBOOGDpbhjvdG2dLaWsqB4KBzul2HytfDTS3H465o= -github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= -github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= +github.com/tmccombs/hcl2json v0.6.4 h1:/FWnzS9JCuyZ4MNwrG4vMrFrzRgsWEOVi+1AyYUVLGw= +github.com/tmccombs/hcl2json v0.6.4/go.mod h1:+ppKlIW3H5nsAsZddXPy2iMyvld3SHxyjswOZhavRDk= +github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8= +github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= @@ -305,16 +305,14 @@ github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940/go.mod h1:CmB go.mongodb.org/mongo-driver v1.11.3/go.mod h1:PTSz5yu21bkT/wXpkS7WR5f0ddqw5quethTUn9WM+2g= go.mongodb.org/mongo-driver v1.17.3 h1:TQyXhnsWfWtgAhMtOgtYHMTkZIfBTpMTsMnd9ZBeHxQ= go.mongodb.org/mongo-driver v1.17.3/go.mod h1:Hy04i7O2kC4RS06ZrhPRqj/u4DTYkFDAAccj+rVKqgQ= -go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= -go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= +go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw= +go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8= +go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc= +go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8= go.opentelemetry.io/otel/sdk v1.29.0 h1:vkqKjk7gwhS8VaWb0POZKmIEDimRCMsopNYnriHyryo= go.opentelemetry.io/otel/sdk v1.29.0/go.mod h1:pM8Dx5WKnvxLCb+8lG1PRNIDxu9g9b9g59Qr7hfAAok= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= +go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4= +go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -326,8 +324,8 @@ golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98y golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= golang.org/x/crypto 
v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= golang.org/x/crypto v0.16.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= -golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= -golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= +golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3/go.mod h1:3p9vT2HGsQu2K1YbXdKPJLVgG5VJdoTa1poYQBtP1AY= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= @@ -338,8 +336,8 @@ golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU= -golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= +golang.org/x/mod v0.25.0 h1:n7a+ZbQKQA/Ysbyb0/6IbB1H/X41mKgbhfv7AfG/44w= +golang.org/x/mod v0.25.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -365,8 +363,8 @@ golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= -golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= -golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= +golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= +golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -376,8 +374,8 @@ golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.2.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= -golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= +golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -413,8 +411,8 @@ golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= -golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -429,8 +427,8 @@ golang.org/x/term v0.11.0/go.mod h1:zC9APTIj3jG3FdV/Ons+XE1riIZXG4aZ4GTHiPZJPIU= golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= -golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= -golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= +golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= +golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -445,8 +443,8 @@ golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= -golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= +golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= +golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= @@ -461,8 +459,8 @@ golang.org/x/tools v0.9.3/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc= golang.org/x/tools v0.12.0/go.mod h1:Sc0INKfu04TlqNoRA1hgpFZbhYXHPr4V5DzpSBTPqQM= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= -golang.org/x/tools v0.32.0 h1:Q7N1vhpkQv7ybVzLFtTjvQya2ewbwNDZzUgfXGqtMWU= -golang.org/x/tools v0.32.0/go.mod h1:ZxrU41P/wAbZD8EDa6dDCa6XfpkhJ7HFMjHJXfBDu8s= +golang.org/x/tools v0.33.0 
h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= +golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -476,8 +474,8 @@ google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2 google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -google.golang.org/protobuf v1.36.1 h1:yBPeRvTftaleIgM3PZ/WBIZ7XM/eEYAaEyCwvyjq/gk= -google.golang.org/protobuf v1.36.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -497,3 +495,5 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= +sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/tests/hpc_config.yml b/tests/hpc_config.yml deleted file mode 100644 index ab2abddd..00000000 --- a/tests/hpc_config.yml +++ /dev/null @@ -1,41 +0,0 @@ -solution: hpc -default_resource_group: Default -non_default_resource_group: WES_TEST -zone: us-east-3 -cluster_name: HPC-LSF-1 -reservation_id: -remote_allowed_ips: -ssh_key: geretain-hpc -login_node_instance_type: bx2-2x8 -login_image_name: hpcaas-lsf10-rhel88-compute-v7 -management_image_name: hpcaas-lsf10-rhel88-v11 -compute_image_name: hpcaas-lsf10-rhel88-compute-v7 -management_node_instance_type: bx2-2x8 -management_node_count: 2 -enable_vpc_flow_logs: false -key_management: key_protect -kms_instance_name: -kms_key_name: -hyperthreading_enabled: true -dns_domain_name: wes.com -enable_app_center: true -app_center_gui_pwd: Pass@123 # pragma: allowlist secret -enable_ldap: true -ldap_basedns: cicd.com -ldap_admin_password: Pass@123 # pragma: allowlist secret -ldap_user_name: tester -ldap_user_password: Pass@123 # pragma: allowlist secret -us_east_zone: us-east-3 -us_east_reservation_id: -us_east_cluster_name: HPC-LSF-2 -eu_de_zone: eu-de-3 -eu_de_reservation_id: -eu_de_cluster_name: HPC-LSF-1 -us_south_zone: us-south-1 -us_south_reservation_id: -jp_tok_zone: jp-tok-1 -jp_tok_cluster_name: HPC-LSF-2 -jp_tok_reservation_id: -us_south_cluster_name: HPC-LSF-1 -ssh_file_path: /artifacts/.ssh/id_rsa -ssh_file_path_two: /artifacts/.ssh/id_rsa diff --git a/tests/logs/output.txt b/tests/logs_output/output.txt similarity index 100% rename from 
tests/logs/output.txt rename to tests/logs_output/output.txt diff --git a/tests/lsf/cluster_helpers.go b/tests/lsf/cluster_helpers.go index bb2d0754..2dbd1639 100644 --- a/tests/lsf/cluster_helpers.go +++ b/tests/lsf/cluster_helpers.go @@ -12,31 +12,30 @@ import ( ) // VerifyManagementNodeConfig verifies the configuration of a management node by performing various checks. -// It checks the cluster ID, master name, Reservation ID, MTU, IP route, hyperthreading, LSF version, solution, Run tasks and file mount. +// It checks the cluster ID, master name, MTU, IP route, hyperthreading, LSF version, Run tasks and file mount. // The results of the checks are logged using the provided logger. func VerifyManagementNodeConfig( t *testing.T, sshMgmtClient *ssh.Client, - expectedClusterName, expectedMasterName, expectedReservationID string, + clusterPrefix string, expectedHyperthreadingStatus bool, managementNodeIPList []string, lsfVersion string, - solution string, logger *utils.AggregatedLogger, ) { - // Verify Cluster ID - checkClusterNameErr := LSFCheckClusterName(t, sshMgmtClient, expectedClusterName, logger) - utils.LogVerificationResult(t, checkClusterNameErr, "Check Cluster ID on management node", logger) + + // Validate LSF health on the management node + healthCheckErr := LSFHealthCheck(t, sshMgmtClient, logger) + utils.LogVerificationResult(t, healthCheckErr, "Validate LSF health on management node", logger) + + // Verify cluster name + clusterNameErr := LSFCheckClusterName(t, sshMgmtClient, clusterPrefix, logger) + utils.LogVerificationResult(t, clusterNameErr, "Verify cluster name on management node", logger) // Verify Master Name - checkMasterNameErr := LSFCheckMasterName(t, sshMgmtClient, expectedMasterName, logger) + checkMasterNameErr := LSFCheckMasterName(t, sshMgmtClient, clusterPrefix, logger) utils.LogVerificationResult(t, checkMasterNameErr, "Check Master Name on management node", logger) - // Verify Reservation ID if the solution contains "hpc" - if strings.Contains(solution, "hpc") { - reservationIDErr := HPCCheckReservationID(t, sshMgmtClient, expectedReservationID, logger) - utils.LogVerificationResult(t, reservationIDErr, "Check Reservation ID on management node", logger) - } // MTU check for management nodes mtuCheckErr := LSFMTUCheck(t, sshMgmtClient, managementNodeIPList, logger) utils.LogVerificationResult(t, mtuCheckErr, "MTU check on management node", logger) @@ -54,7 +53,7 @@ func VerifyManagementNodeConfig( utils.LogVerificationResult(t, versionErr, "check LSF version on management node", logger) //File Mount - fileMountErr := HPCCheckFileMount(t, sshMgmtClient, managementNodeIPList, "management", logger) + fileMountErr := CheckFileMount(t, sshMgmtClient, managementNodeIPList, "management", logger) utils.LogVerificationResult(t, fileMountErr, "File mount check on management node", logger) } @@ -155,11 +154,11 @@ func VerifyComputeNodeConfig( logger *utils.AggregatedLogger, ) { - // MTU check for management nodes + // MTU check for compute nodes mtuCheckErr := LSFMTUCheck(t, sshMgmtClient, computeNodeIPList, logger) utils.LogVerificationResult(t, mtuCheckErr, "MTU check on compute node", logger) - // IP route check for management nodes + // IP route check for compute nodes ipRouteCheckErr := LSFIPRouteCheck(t, sshMgmtClient, computeNodeIPList, logger) utils.LogVerificationResult(t, ipRouteCheckErr, "IP route check on compute node", logger) @@ -168,7 +167,7 @@ func VerifyComputeNodeConfig( utils.LogVerificationResult(t, hyperthreadErr, "Hyperthreading check on compute 
node", logger) // File mount - fileMountErr := HPCCheckFileMount(t, sshMgmtClient, computeNodeIPList, "compute", logger) + fileMountErr := CheckFileMount(t, sshMgmtClient, computeNodeIPList, "compute", logger) utils.LogVerificationResult(t, fileMountErr, "File mount check on compute node", logger) // Intel One mpi @@ -190,68 +189,68 @@ func VerifyAPPCenterConfig( } -// VerifyLoginNodeConfig verifies the configuration of a login node by performing various checks. -// It checks the cluster ID, master name, Reservation ID, MTU, IP route, hyperthreading, LSF version, Run tasks and file mount. -// The results of the checks are logged using the provided logger. +// VerifyLoginNodeConfig validates the configuration of a login node by performing multiple checks. +// It verifies the cluster name, master node name, MTU settings, IP routing, hyperthreading status, +// LSF version, file mounts, job execution, and LSF command availability. +// All results are logged using the provided logger. func VerifyLoginNodeConfig( t *testing.T, sshLoginClient *ssh.Client, - expectedClusterName, expectedMasterName, expectedReservationID string, + clusterPrefix string, expectedHyperthreadingStatus bool, loginNodeIP string, jobCommand string, lsfVersion string, logger *utils.AggregatedLogger, ) { + // Verify cluster name + clusterNameErr := LSFCheckClusterName(t, sshLoginClient, clusterPrefix, logger) + utils.LogVerificationResult(t, clusterNameErr, "Verify cluster name on login node", logger) - // Verify cluster ID - checkClusterNameErr := LSFCheckClusterName(t, sshLoginClient, expectedClusterName, logger) - utils.LogVerificationResult(t, checkClusterNameErr, "check Cluster ID on login node", logger) + // Verify master node name + masterNameErr := LSFCheckMasterName(t, sshLoginClient, clusterPrefix, logger) + utils.LogVerificationResult(t, masterNameErr, "Verify master node name on login node", logger) - // Verify master name - checkMasterNameErr := LSFCheckMasterName(t, sshLoginClient, expectedMasterName, logger) - utils.LogVerificationResult(t, checkMasterNameErr, "check Master name on login node", logger) + // Check MTU configuration + mtuErr := LSFMTUCheck(t, sshLoginClient, []string{loginNodeIP}, logger) + utils.LogVerificationResult(t, mtuErr, "Verify MTU configuration on login node", logger) - // MTU check for login nodes - mtuCheckErr := LSFMTUCheck(t, sshLoginClient, []string{loginNodeIP}, logger) - utils.LogVerificationResult(t, mtuCheckErr, "MTU check on login node", logger) + // Check IP routing + ipRouteErr := LSFIPRouteCheck(t, sshLoginClient, []string{loginNodeIP}, logger) + utils.LogVerificationResult(t, ipRouteErr, "Verify IP routing on login node", logger) - // IP route check for login nodes - ipRouteCheckErr := LSFIPRouteCheck(t, sshLoginClient, []string{loginNodeIP}, logger) - utils.LogVerificationResult(t, ipRouteCheckErr, "IP route check on login node", logger) - - // Hyperthreading check - hyperthreadErr := LSFCheckHyperthreading(t, sshLoginClient, expectedHyperthreadingStatus, logger) - utils.LogVerificationResult(t, hyperthreadErr, "Hyperthreading check on login node", logger) + // Check hyperthreading status + hyperthreadingErr := LSFCheckHyperthreading(t, sshLoginClient, expectedHyperthreadingStatus, logger) + utils.LogVerificationResult(t, hyperthreadingErr, "Verify hyperthreading status on login node", logger) - // LSF version check + // Check LSF version versionErr := CheckLSFVersion(t, sshLoginClient, lsfVersion, logger) - utils.LogVerificationResult(t, versionErr, "check LSF 
version", logger) + utils.LogVerificationResult(t, versionErr, "Verify LSF version on login node", logger) - //File Mount - fileMountErr := HPCCheckFileMount(t, sshLoginClient, []string{loginNodeIP}, "login", logger) - utils.LogVerificationResult(t, fileMountErr, "File mount check on login node", logger) + // Verify file mounts + fileMountErr := CheckFileMount(t, sshLoginClient, []string{loginNodeIP}, "login", logger) + utils.LogVerificationResult(t, fileMountErr, "Verify file mounts on login node", logger) - //Run job - jobErr := LSFRunJobs(t, sshLoginClient, LOGIN_NODE_EXECUTION_PATH+jobCommand, logger) //Added the executable path - utils.LogVerificationResult(t, jobErr, "check Run job on login node", logger) + // Execute test job + jobExecutionErr := LSFRunJobs(t, sshLoginClient, LOGIN_NODE_EXECUTION_PATH+jobCommand, logger) + utils.LogVerificationResult(t, jobExecutionErr, "Verify job execution on login node", logger) - // Verify LSF commands + // Verify LSF commands availability lsfCmdErr := VerifyLSFCommands(t, sshLoginClient, "login", logger) - utils.LogVerificationResult(t, lsfCmdErr, "Check the 'lsf' command on the login node", logger) + utils.LogVerificationResult(t, lsfCmdErr, "Verify availability of LSF commands on login node", logger) } // VerifyTestTerraformOutputs is a function that verifies the Terraform outputs for a test scenario. func VerifyTestTerraformOutputs( t *testing.T, - LastTestTerraformOutputs map[string]interface{}, - isAPPCenterEnabled bool, + bastionIP, deployerIP string, + isCloudLogEnabled, isCloudMonitoringEnabled bool, ldapServerEnabled bool, logger *utils.AggregatedLogger, ) { // Check the Terraform logger outputs - outputErr := VerifyTerraformOutputs(t, LastTestTerraformOutputs, isAPPCenterEnabled, ldapServerEnabled, logger) + outputErr := ValidateTerraformOutput(t, bastionIP, deployerIP, isCloudLogEnabled, isCloudMonitoringEnabled, ldapServerEnabled, logger) utils.LogVerificationResult(t, outputErr, "check terraform outputs", logger) } @@ -278,7 +277,7 @@ func VerifyNoVNCConfig( ) { // Verify noVNC center - appCenterErr := HPCCheckNoVNC(t, sshMgmtClient, logger) + appCenterErr := LSFCheckNoVNC(t, sshMgmtClient, logger) utils.LogVerificationResult(t, appCenterErr, "check noVnc", logger) } @@ -340,7 +339,7 @@ func VerifyManagementNodeLDAPConfig( }() // Check file mount - if err := HPCCheckFileMountAsLDAPUser(t, sshLdapClient, "management", logger); err != nil { + if err := CheckFileMountAsLDAPUser(t, sshLdapClient, "management", logger); err != nil { utils.LogVerificationResult(t, err, "File mount check as LDAP user on management node failed", logger) } @@ -407,7 +406,7 @@ func VerifyLoginNodeLDAPConfig( }() // Check file mount - if err := HPCCheckFileMountAsLDAPUser(t, sshLdapClient, "login", logger); err != nil { + if err := CheckFileMountAsLDAPUser(t, sshLdapClient, "login", logger); err != nil { utils.LogVerificationResult(t, err, "File mount check as LDAP user on login node failed", logger) } @@ -456,7 +455,7 @@ func VerifyComputeNodeLDAPConfig( utils.LogVerificationResult(t, ldapErr, "ldap configuration check on the compute node", logger) // Check file mount - fileMountErr := HPCCheckFileMountAsLDAPUser(t, sshLdapClient, "compute", logger) + fileMountErr := CheckFileMountAsLDAPUser(t, sshLdapClient, "compute", logger) utils.LogVerificationResult(t, fileMountErr, "check file mount as an LDAP user on the compute node", logger) // Verify LSF commands @@ -487,11 +486,21 @@ func CheckLDAPServerStatus(t *testing.T, sClient *ssh.Client, ldapAdminpassword, 
utils.LogVerificationResult(t, ldapErr, "ldap Server Status", logger) } -// VerifyPTRRecordsForManagementAndLoginNodes verifies PTR records for 'mgmt' or 'login' nodes and ensures their resolution via SSH. +// // VerifyPTRRecordsForManagementAndLoginNodes verifies PTR records for 'mgmt' or 'login' nodes and ensures their resolution via SSH. +// // It retrieves hostnames, performs nslookup to verify PTR records, and returns an error if any step fails. +// func VerifyPTRRecordsForManagementAndLoginNodes(t *testing.T, sClient *ssh.Client, publicHostName, publicHostIP, privateHostName string, managementNodeIPList []string, loginNodeIP string, domainName string, logger *utils.AggregatedLogger) { +// // Call sub-function to verify PTR records +// err := verifyPTRRecords(t, sClient, publicHostName, publicHostIP, privateHostName, managementNodeIPList, loginNodeIP, domainName, logger) +// // Log the verification result +// utils.LogVerificationResult(t, err, "PTR Records For Management And Login Nodes", logger) + +// } + +// VerifyPTRRecordsForManagementAndLoginNodes verifies PTR records for 'mgmt' nodes and ensures their resolution via SSH. // It retrieves hostnames, performs nslookup to verify PTR records, and returns an error if any step fails. -func VerifyPTRRecordsForManagementAndLoginNodes(t *testing.T, sClient *ssh.Client, publicHostName, publicHostIP, privateHostName string, managementNodeIPList []string, loginNodeIP string, domainName string, logger *utils.AggregatedLogger) { +func VerifyPTRRecordsForManagement(t *testing.T, sClient *ssh.Client, publicHostName, publicHostIP, privateHostName string, managementNodeIPList []string, domainName string, logger *utils.AggregatedLogger) { // Call sub-function to verify PTR records - err := verifyPTRRecords(t, sClient, publicHostName, publicHostIP, privateHostName, managementNodeIPList, loginNodeIP, domainName, logger) + err := verifyPTRRecords(t, sClient, publicHostName, publicHostIP, privateHostName, managementNodeIPList, domainName, logger) // Log the verification result utils.LogVerificationResult(t, err, "PTR Records For Management And Login Nodes", logger) @@ -551,22 +560,22 @@ func VerifyCreateNewLdapUserAndManagementNodeLDAPConfig( ldapServerIP string, managementNodeIPList []string, jobCommand string, + ldapUserName string, ldapAdminPassword string, ldapDomainName string, - ldapUserName string, - ldapUserPassword string, newLdapUserName string, + newLdapUserPassword string, logger *utils.AggregatedLogger, ) { // Add a new LDAP user - if err := HPCAddNewLDAPUser(t, sldapClient, ldapAdminPassword, ldapDomainName, ldapUserName, newLdapUserName, logger); err != nil { + if err := LSFAddNewLDAPUser(t, sldapClient, ldapAdminPassword, ldapDomainName, ldapUserName, newLdapUserName, newLdapUserPassword, logger); err != nil { utils.LogVerificationResult(t, err, "add new LDAP user", logger) return } // Connect to the management node via SSH as the new LDAP user - sshLdapClientUser, err := utils.ConnectToHostAsLDAPUser(LSF_PUBLIC_HOST_NAME, bastionIP, managementNodeIPList[0], newLdapUserName, ldapUserPassword) + sshLdapClientUser, err := utils.ConnectToHostAsLDAPUser(LSF_PUBLIC_HOST_NAME, bastionIP, managementNodeIPList[0], newLdapUserName, newLdapUserPassword) if err != nil { utils.LogVerificationResult(t, err, "connect to the management node via SSH as the new LDAP user", logger) return @@ -673,39 +682,48 @@ func ValidateSCCInstance(t *testing.T, apiKey, region, resourceGroup, clusterPre } // VerifyCloudLogs validates the configuration and status of 
cloud logging services. -// It checks the correctness of cloud logs URLs from Terraform outputs and validates logging services for management and compute nodes. // The function logs verification results for each step and handles errors gracefully. // Parameters include test context, SSH client, cluster details, and logging configuration. // The function does not return values but logs outcomes for validation steps. func VerifyCloudLogs( t *testing.T, sshClient *ssh.Client, - expectedSolution string, LastTestTerraformOutputs map[string]interface{}, managementNodeIPList []string, staticWorkerNodeIPList []string, isCloudLogsEnabledForManagement, isCloudLogsEnabledForCompute bool, logger *utils.AggregatedLogger) { - // Verify cloud logs URL from Terraform outputs - err := VerifyCloudLogsURLFromTerraformOutput(t, LastTestTerraformOutputs, isCloudLogsEnabledForManagement, isCloudLogsEnabledForCompute, logger) - utils.LogVerificationResult(t, err, "cloud logs URL from Terraform outputs", logger) - // Verify Fluent Bit service for management nodes - mgmtErr := LSFFluentBitServiceForManagementNodes(t, sshClient, managementNodeIPList, isCloudLogsEnabledForManagement, logger) + mgmtErr := VerifyFluentBitServiceForManagementNodes(t, sshClient, managementNodeIPList, isCloudLogsEnabledForManagement, logger) utils.LogVerificationResult(t, mgmtErr, "Fluent Bit service for management nodes", logger) // Verify Fluent Bit service for compute nodes - compErr := LSFFluentBitServiceForComputeNodes(t, sshClient, expectedSolution, staticWorkerNodeIPList, isCloudLogsEnabledForCompute, logger) + compErr := VerifyFluentBitServiceForComputeNodes(t, sshClient, staticWorkerNodeIPList, isCloudLogsEnabledForCompute, logger) utils.LogVerificationResult(t, compErr, "Fluent Bit service for compute nodes", logger) } +// VerifyPlatformLogs validates whether platform logs are enabled or disabled. +// It uses the provided API key, region, and logger to check the platform log status. +// The result is logged using the aggregated logger. +func VerifyPlatformLogs( + t *testing.T, + apiKey, region, resourceGroup string, + isPlatformLogsEnabled bool, + logger *utils.AggregatedLogger, +) { + + err := VerifyPlatformStatus(t, apiKey, region, resourceGroup, isPlatformLogsEnabled, logger) + utils.LogVerificationResult(t, err, "Platform logs", logger) + +} + // ValidateDynamicNodeProfile validates the dynamic worker node profile by fetching it from Terraform variables // and comparing it against the expected profile obtained from IBM Cloud CLI. 
func ValidateDynamicNodeProfile(t *testing.T, apiKey, region, resourceGroup, clusterPrefix string, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - expectedDynamicWorkerProfile, expectedWorkerNodeProfileErr := utils.GetFirstWorkerNodeInstanceType(t, options.TerraformVars, logger) - utils.LogVerificationResult(t, expectedWorkerNodeProfileErr, "Fetching worker node profile", logger) + expectedDynamicWorkerProfile, expectedWorkerNodeProfileErr := utils.GetFirstDynamicComputeProfile(t, options.TerraformVars, logger) + utils.LogVerificationResult(t, expectedWorkerNodeProfileErr, "Fetching dynamic worker node profile", logger) validateDynamicWorkerProfileErr := ValidateDynamicWorkerProfile(t, apiKey, region, resourceGroup, clusterPrefix, expectedDynamicWorkerProfile, logger) utils.LogVerificationResult(t, validateDynamicWorkerProfileErr, "Validating dynamic worker node profile", logger) @@ -713,30 +731,24 @@ func ValidateDynamicNodeProfile(t *testing.T, apiKey, region, resourceGroup, clu } // VerifyCloudMonitoring checks the cloud monitoring configuration and status. -// It validates cloud log URLs from Terraform outputs and monitoring services -// for management and compute nodes. The function logs verification results +// The function logs verification results // and handles errors gracefully. It takes test context, SSH client, cluster // details, monitoring flags, and a logger as parameters. No values are // returned; only validation outcomes are logged. func VerifyCloudMonitoring( t *testing.T, sshClient *ssh.Client, - expectedSolution string, LastTestTerraformOutputs map[string]interface{}, managementNodeIPList []string, staticWorkerNodeIPList []string, - isCloudMonitoringEnabledForManagement, isCloudMonitoringEnabledForCompute bool, + isCloudMonitoringEnableForManagement, isCloudMonitoringEnableForCompute bool, logger *utils.AggregatedLogger) { - // Verify cloud logs URL from Terraform outputs - err := VerifycloudMonitoringURLFromTerraformOutput(t, LastTestTerraformOutputs, isCloudMonitoringEnabledForManagement, isCloudMonitoringEnabledForCompute, logger) - utils.LogVerificationResult(t, err, "cloud logs URL from Terraform outputs", logger) - // Verify Prometheus Dragent service for management nodes - mgmtErr := LSFPrometheusAndDragentServiceForManagementNodes(t, sshClient, managementNodeIPList, isCloudMonitoringEnabledForManagement, logger) + mgmtErr := LSFPrometheusAndDragentServiceForManagementNodes(t, sshClient, managementNodeIPList, isCloudMonitoringEnableForManagement, logger) utils.LogVerificationResult(t, mgmtErr, "Prometheus and Dragent service for management nodes", logger) // Verify Dragent service for compute nodes - compErr := LSFDragentServiceForComputeNodes(t, sshClient, expectedSolution, staticWorkerNodeIPList, isCloudMonitoringEnabledForCompute, logger) + compErr := LSFDragentServiceForComputeNodes(t, sshClient, staticWorkerNodeIPList, isCloudMonitoringEnableForCompute, logger) utils.LogVerificationResult(t, compErr, "Prometheus and Dragent service for compute nodes", logger) } @@ -756,18 +768,13 @@ func ValidateAtracker(t *testing.T, apiKey, region, resourceGroup, clusterPrefix return // Exit early to prevent further errors } - // Ensure Target ID is set and has a valid length when Observability Atracker is enabled - trimmedTargetID := strings.TrimSpace(targetID) - if len(trimmedTargetID) <= 36 { - utils.LogVerificationResult(t, fmt.Errorf("target ID is either missing or too short (must be more than 36 characters)"), - "ValidateAtracker: Target ID 
invalid", logger) - return - } - // Validate the Atracker Route Target atrackerRouteTargetErr := ValidateAtrackerRouteTarget(t, apiKey, region, resourceGroup, clusterPrefix, targetID, targetType, logger) if atrackerRouteTargetErr != nil { utils.LogVerificationResult(t, atrackerRouteTargetErr, "ValidateAtracker: Validation failed for Atracker Route Target", logger) } + } else { + logger.Warn(t, "Cloud atracker is disabled - skipping validation of Atracker Route Target.") + } } diff --git a/tests/lsf/cluster_utils.go b/tests/lsf/cluster_utils.go index 98e71a4f..4d3aa2f8 100644 --- a/tests/lsf/cluster_utils.go +++ b/tests/lsf/cluster_utils.go @@ -15,6 +15,7 @@ import ( "testing" "time" + "github.com/stretchr/testify/require" "github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper/testhelper" utils "github.com/terraform-ibm-modules/terraform-ibm-hpc/utilities" "golang.org/x/crypto/ssh" @@ -25,7 +26,11 @@ const ( timeOutForDynamicNodeDisappear = 15 * time.Minute jobCompletionWaitTime = 50 * time.Second dynamicNodeWaitTime = 3 * time.Minute - appCenterPort = 8443 +) + +const ( + LSFVersion14 = "fixpack_14" + LSFVersion15 = "fixpack_15" ) // LSFMTUCheck checks the MTU setting for multiple nodes of a specified type. @@ -234,58 +239,64 @@ func LSFRestartDaemons(t *testing.T, sClient *ssh.Client, logger *utils.Aggregat // LSFControlBctrld performs start or stop operations on the bctrld daemon on the specified machine. // It returns an error if any step fails or if an invalid value (other than 'start' or 'stop') is provided. // It executes the 'bctrld' command with the specified operation and waits for the daemon to start or stop. +// Only mgmt node lines are used to verify reachability. func LSFControlBctrld(t *testing.T, sClient *ssh.Client, startOrStop string, logger *utils.AggregatedLogger) error { - // Make startOrStop case-insensitive startOrStop = strings.ToLower(startOrStop) - // Validate the operation type if startOrStop != "start" && startOrStop != "stop" { return fmt.Errorf("invalid operation type. 
Please specify 'start' or 'stop'") } var command string - - // Construct the command based on the operation type if startOrStop == "stop" { command = "bctrld stop sbd" } else { command = "sudo su -l root -c 'systemctl restart lsfd'" } - // Execute the command if _, err := utils.RunCommandInSSHSession(sClient, command); err != nil { return fmt.Errorf("failed to run '%s' command: %w", command, err) } - // Sleep for a specified duration to allow time for the daemon to start or stop + // Sleep to give daemon time to settle if startOrStop == "stop" { - time.Sleep(63 * time.Second) + time.Sleep(90 * time.Second) } else { time.Sleep(120 * time.Second) } - // Check the status of the daemon using the 'bhosts -w' command on the remote SSH server + // Run bhosts and get status statusCmd := "bhosts -w" - out, err := utils.RunCommandInSSHSession(sClient, statusCmd) + output, err := utils.RunCommandInSSHSession(sClient, statusCmd) if err != nil { - return fmt.Errorf("failed to run 'bhosts' command: %w", err) + return fmt.Errorf("failed to run 'bhosts -w': %w", err) } - // Count the number of unreachable nodes - unreachCount := strings.Count(string(out), "unreach") + logger.DEBUG(t, fmt.Sprintf("startOrStop: %s", startOrStop)) + logger.DEBUG(t, fmt.Sprintf("bhosts -w Output:\n%s", string(output))) - // Check the output based on the startOrStop parameter - expectedUnreachCount := 0 - if startOrStop == "stop" { - expectedUnreachCount = 1 + // Filter only -mgmt- lines + lines := strings.Split(string(output), "\n") + unreachMgmtCount := 0 + for _, line := range lines { + if strings.Contains(line, "-mgmt-") && strings.Contains(line, "unreach") { + unreachMgmtCount++ + } } - if unreachCount != expectedUnreachCount { - // If the unreachable node count does not match the expected count, return an error - return fmt.Errorf("failed to %s the sbd daemon on the management node", startOrStop) + // Validate based on operation + if startOrStop == "stop" { + if unreachMgmtCount == 0 { + return fmt.Errorf("expected mgmt node to be unreachable after stop, found none") + } + logger.Info(t, fmt.Sprintf("Daemon stopped successfully. %d mgmt nodes are unreachable", unreachMgmtCount)) + } else { + if unreachMgmtCount > 0 { + return fmt.Errorf("expected all mgmt nodes to be reachable after start, but found %d unreachable", unreachMgmtCount) + } + logger.Info(t, "Daemon started successfully. All mgmt nodes are reachable.") } - // Log success if no errors occurred logger.Info(t, fmt.Sprintf("Daemon %s successfully", startOrStop)) return nil } @@ -396,7 +407,7 @@ func LSFRunJobs(t *testing.T, sClient *ssh.Client, jobCmd string, logger *utils. if err != nil { return err } - min = 300 + min + min = 720 + min jobMaxTimeout = time.Duration(min) * time.Second // Log the job output for debugging purposes @@ -472,103 +483,168 @@ func LSFExtractJobID(response string) (string, error) { // It sets a timeout and checks for disappearance until completion. Returns an error if the timeout is exceeded or if // there is an issue running the SSH command. 
func WaitForDynamicNodeDisappearance(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) error { - // Record the start time of the job execution + const ( + statusOK = "ok" + workerKeyword = "comp" + pollInterval = 90 * time.Second + ) + startTime := time.Now() + waitCount := 0 - // Continuously monitor the dynamic node until it disappears or the timeout occurs for time.Since(startTime) < timeOutForDynamicNodeDisappear { - // Run the 'bhosts -w' command on the remote SSH server - command := "bhosts -w" - output, err := utils.RunCommandInSSHSession(sClient, command) + output, err := utils.RunCommandInSSHSession(sClient, "bhosts -w") if err != nil { - return fmt.Errorf("failed to execute SSH command '%s': %w", command, err) + return fmt.Errorf("SSH command failed: %w", err) } - // Split the output into lines and process each line - lines := strings.Split(output, "\n") foundRelevantNode := false - for _, line := range lines { - // Check if the line contains "ok" and does not contain "-worker-" - if strings.Contains(line, "ok") && !strings.Contains(line, "-worker-") { + var activeNode string + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + if strings.Contains(line, statusOK) && !strings.Contains(line, workerKeyword) { foundRelevantNode = true - logger.Info(t, fmt.Sprintf("Relevant dynamic node still present: %s", line)) + activeNode = line break } } - if foundRelevantNode { - // Wait and retry if a relevant node is still present - time.Sleep(90 * time.Second) - } else { - // All relevant dynamic nodes have disappeared - logger.Info(t, "All relevant dynamic nodes have disappeared!") + if !foundRelevantNode { + logger.Info(t, "Cluster status: All dynamic nodes have been removed successfully") return nil } + + waitCount++ + elapsed := time.Since(startTime) + logger.Info(t, fmt.Sprintf("Monitoring: Node '%s' still active (elapsed: %.1f minutes)", + strings.Fields(activeNode)[0], // Extract just the node name + elapsed.Minutes())) + + time.Sleep(pollInterval) } - // Timeout exceeded while waiting for dynamic nodes to disappear - return fmt.Errorf("timeout of %s occurred while waiting for the dynamic node to disappear", timeOutForDynamicNodeDisappear.String()) + return fmt.Errorf("operation timed out after %.1f minutes: dynamic nodes still present", + timeOutForDynamicNodeDisappear.Minutes()) } -// LSFAPPCenterConfiguration performs configuration validation for the APP Center by checking the status of essential services -// (WEBGUI and PNC) and ensuring that the APP center port (8081) is actively listening. -// Returns an error if the validation encounters issues, otherwise, nil is returned. -// LSFAPPCenterConfiguration checks and validates the configuration of the LSF App Center. -// It verifies whether the APP Center GUI or PNC is configured correctly, -// if the APP center port is listening as expected, if the APP center binary is installed, -// and if MariaDB packages are installed as expected. -// Returns an error if any validation check fails, otherwise nil. - +// LSFAPPCenterConfiguration performs configuration validation for the APP Center by checking essential services +// (WEBGUI and PNC), port listening, installed binaries, and MariaDB packages. +// Returns an error if any validation check fails; otherwise, returns nil. func LSFAPPCenterConfiguration(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) error { lsfAppCenterPkg := "lsf-appcenter-10." 
+ port := "8443" + //expectedStatus := "200 OK" - // Check the result of CheckAppCenterSetup for any errors + // 1. Validate App Center setup using pmcadmin if err := CheckAppCenterSetup(t, sClient, logger); err != nil { - // If there's an error, return it wrapped with a custom message - return fmt.Errorf("CheckAppCenterSetup pmcadmin list validation failed : %w", err) + return fmt.Errorf("CheckAppCenterSetup pmcadmin list validation failed: %w", err) } - // Command to check if APP center port is listening as expected - portStatusCommand := fmt.Sprintf("netstat -tuln | grep %d", appCenterPort) + // 2. Check if port is listening + portStatusCommand := fmt.Sprintf("netstat -tuln | grep %s", port) portStatusOutput, err := utils.RunCommandInSSHSession(sClient, portStatusCommand) if err != nil { return fmt.Errorf("failed to execute command '%s': %w", portStatusCommand, err) } - if !utils.VerifyDataContains(t, portStatusOutput, "LISTEN", logger) { - return fmt.Errorf("APP center port not listening as expected: %s", portStatusOutput) + return fmt.Errorf("APP Center port %s is not listening as expected: %s", port, portStatusOutput) } - // Command to check if APP center binary is installed as expected + // 3. Validate App Center binary is installed appBinaryCommand := "rpm -qa | grep lsf-appcenter" appBinaryOutput, err := utils.RunCommandInSSHSession(sClient, appBinaryCommand) if err != nil { return fmt.Errorf("failed to execute command '%s': %w", appBinaryCommand, err) } - if !utils.VerifyDataContains(t, appBinaryOutput, lsfAppCenterPkg, logger) { - return fmt.Errorf("APP center binary not installed as expected: %s", appBinaryOutput) + return fmt.Errorf("app Center binary not installed as expected: %s", appBinaryOutput) } - // Command to check if MariaDB packages are installed as expected - mariaDBCommand := "rpm -qa | grep MariaDB" - mariaDBOutput, err := utils.RunCommandInSSHSession(sClient, mariaDBCommand) + // 4. Define the command to check mariadb status + mariaDBCommand := "sudo su -l root -c 'systemctl status mariadb'" + + const expectedMessage = "Active: active (running)" + + // Run the systemctl command on the remote host + output, err := utils.RunCommandInSSHSession(sClient, mariaDBCommand) if err != nil { - return fmt.Errorf("failed to execute command '%s': %w", mariaDBCommand, err) + return fmt.Errorf("failed to run '%s': %w", mariaDBCommand, err) } - mariaDBPackages := [4]string{"MariaDB-client", "MariaDB-common", "MariaDB-shared", "MariaDB-server"} - for _, out := range mariaDBPackages { - if !utils.VerifyDataContains(t, mariaDBOutput, out, logger) { - return fmt.Errorf("MariaDB was not installed as expected binary: %s", mariaDBOutput) - } + // Check if the output contains the expected active message + if !utils.VerifyDataContains(t, string(output), expectedMessage, logger) { + return fmt.Errorf("mariadb health check failed: expected message '%s' not found in output:\n%s", expectedMessage, string(output)) } - // Log success if no errors occurred - logger.Info(t, "Appcenter configuration validated successfully") + + // 5. 
Validate web interface status via curl + // curlCommand := fmt.Sprintf("curl -i http://localhost:%s/platform/login | head -1", port) + // curlCommandOutput, err := utils.RunCommandInSSHSession(sClient, curlCommand) + // if err != nil { + // return fmt.Errorf("failed to execute command '%s': %w", curlCommand, err) + // } + // if !utils.VerifyDataContains(t, string(curlCommandOutput), expectedStatus, logger) { + // return fmt.Errorf("app Center did not return expected HTTP status '%s': %s", expectedStatus, curlCommandOutput) + // } + + logger.Info(t, "App Center configuration validated successfully") return nil } +// // LSFGETDynamicComputeNodeIPs retrieves the IP addresses of static nodes with a status of "ok" in an LSF cluster. +// // It excludes nodes containing "worker" in their HOST_NAME and processes the IP addresses from the node names. +// // The function executes the "bhosts -w" command over an SSH session, parses the output, and returns a sorted slice of IP addresses. +// // Returns: +// // - A sorted slice of IP addresses as []string. +// // - An error if the command execution or output parsing fails. +// func LSFGETDynamicComputeNodeIPs(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) ([]string, error) { +// const ( +// statusOK = "ok" +// workerKeyword = "worker" +// ) + +// // Run the "bhosts -w" command to get the node status +// nodeStatus, err := utils.RunCommandInSSHSession(sClient, "bhosts -w") +// if err != nil { +// return nil, fmt.Errorf("failed to execute 'bhosts' command: %w", err) +// } + +// var workerIPs []string + +// // Parse the command output +// scanner := bufio.NewScanner(strings.NewReader(nodeStatus)) +// for scanner.Scan() { +// fields := strings.Fields(scanner.Text()) + +// // Ensure fields exist and match the required conditions +// if len(fields) > 1 && fields[1] == statusOK && !strings.Contains(fields[0], workerKeyword) { +// // Extract the IP address from the HOST_NAME (expected format: -) +// parts := strings.Split(fields[0], "-") +// if len(parts) >= 4 { // Ensure enough segments exist +// ip := strings.Join(parts[len(parts)-4:], ".") +// workerIPs = append(workerIPs, ip) +// } +// } +// } + +// // Check for scanning errors +// if err := scanner.Err(); err != nil { +// return nil, fmt.Errorf("error scanning node status: %w", err) +// } + +// // Sort the IP addresses +// sort.Strings(workerIPs) + +// // Log the retrieved IPs +// logger.Info(t, fmt.Sprintf("Retrieved Worker IPs: %v", workerIPs)) + +// return workerIPs, nil +// } + // LSFGETDynamicComputeNodeIPs retrieves the IP addresses of static nodes with a status of "ok" in an LSF cluster. // It excludes nodes containing "worker" in their HOST_NAME and processes the IP addresses from the node names. // The function executes the "bhosts -w" command over an SSH session, parses the output, and returns a sorted slice of IP addresses. @@ -578,7 +654,7 @@ func LSFAPPCenterConfiguration(t *testing.T, sClient *ssh.Client, logger *utils. 
func LSFGETDynamicComputeNodeIPs(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) ([]string, error) { const ( statusOK = "ok" - workerKeyword = "worker" + workerKeyword = "-comp-" ) // Run the "bhosts -w" command to get the node status @@ -614,7 +690,7 @@ func LSFGETDynamicComputeNodeIPs(t *testing.T, sClient *ssh.Client, logger *util sort.Strings(workerIPs) // Log the retrieved IPs - logger.Info(t, fmt.Sprintf("Retrieved Worker IPs: %v", workerIPs)) + logger.Info(t, fmt.Sprintf("Retrieved Dynamic Worker IPs: %v", workerIPs)) return workerIPs, nil } @@ -655,7 +731,7 @@ func HPCGETDynamicComputeNodeIPs(t *testing.T, sClient *ssh.Client, logger *util sort.Strings(workerIPs) // Log the retrieved IPs - logger.Info(t, fmt.Sprintf("Retrieved Worker IPs: %v", workerIPs)) + logger.Info(t, fmt.Sprintf("Retrieved Dynamic Worker IPs: %v", workerIPs)) return workerIPs, nil } @@ -924,29 +1000,33 @@ func LSFCheckSSHKeyForComputeNodes(t *testing.T, sClient *ssh.Client, computeNod return nil } -// CheckLSFVersion verifies if the IBM Spectrum LSF version on the cluster matches the expected version. -// It executes the 'lsid' command, retrieves the cluster ID, and compares it with the expected version. -func CheckLSFVersion(t *testing.T, sClient *ssh.Client, expectedVersion string, logger *utils.AggregatedLogger) error { - - // Execute the 'lsid' command to get the cluster ID +// CheckLSFVersion verifies that the IBM Spectrum LSF version on the cluster +// matches the expected Fixpack version by running the 'lsid' command. +func CheckLSFVersion(t *testing.T, sClient *ssh.Client, lsfVersion string, logger *utils.AggregatedLogger) error { command := LOGIN_NODE_EXECUTION_PATH + "lsid" output, err := utils.RunCommandInSSHSession(sClient, command) if err != nil { - // Handle the error when executing the 'lsid' command return fmt.Errorf("failed to execute 'lsid' command: %w", err) } - // Verify if the expected cluster ID is present in the output - if !utils.VerifyDataContains(t, output, "IBM Spectrum LSF Standard "+expectedVersion, logger) { - // Extract actual cluster version from the output for better error reporting - actualValue := strings.TrimSpace(strings.Split(strings.Split(output, "IBM Spectrum LSF Standard")[1], ", ")[0]) + var expectedVersion string + switch lsfVersion { + case LSFVersion14: + expectedVersion = LSF_VERSION_FP14 + case LSFVersion15: + expectedVersion = LSF_VERSION_FP15 + default: + return fmt.Errorf("unsupported LSF version identifier: %s", lsfVersion) + } + + expectedString := "IBM Spectrum LSF " + expectedVersion + if !utils.VerifyDataContains(t, output, expectedString, logger) { + actualValue := strings.TrimSpace(strings.Split(strings.Split(output, "IBM Spectrum LSF")[1], ", ")[0]) return fmt.Errorf("expected cluster Version %s, but found %s", expectedVersion, actualValue) } - // Log information when the cluster version is set as expected logger.Info(t, fmt.Sprintf("Cluster Version is set as expected: %s", expectedVersion)) - // No errors occurred return nil } @@ -1002,11 +1082,11 @@ func GetOSNameOfNode(t *testing.T, sClient *ssh.Client, hostIP string, logger *u return "", parseErr } -// HPCCheckFileMount checks if essential LSF directories (conf, config_done, das_staging_area, data, gui-conf, gui-logs, log, repository-path and work) exist +// CheckFileMount checks if essential LSF directories ("gui", "lsf", "perf", "ppm", and "ssh",) exist // on remote machines identified by the provided list of IP addresses. 
It utilizes SSH to // query and validate the directories. Any missing directory triggers an error, and the // function logs the success message if all directories are found. -func HPCCheckFileMount(t *testing.T, sClient *ssh.Client, ipsList []string, nodeType string, logger *utils.AggregatedLogger) error { +func CheckFileMount(t *testing.T, sClient *ssh.Client, ipsList []string, nodeType string, logger *utils.AggregatedLogger) error { // Define constants const ( sampleText = "Welcome to the ibm cloud HPC" @@ -1031,12 +1111,12 @@ func HPCCheckFileMount(t *testing.T, sClient *ssh.Client, ipsList []string, node // Check if it's not a login node if !(strings.Contains(strings.ToLower(nodeType), "login")) { // Define expected file system mounts - expectedMount := []string{"/mnt/lsf", "/mnt/vpcstorage/tools", "/mnt/vpcstorage/data"} + expectedMounts := []string{"/mnt/lsf", "/mnt/vpcstorage/tools", "/mnt/vpcstorage/data"} // Check if all expected mounts exist - for _, mount := range expectedMount { + for _, mount := range expectedMounts { if !utils.VerifyDataContains(t, actualMount, mount, logger) { - return fmt.Errorf("actual filesystem '%v' does not match the expected filesystem '%v' for node IP '%s'", actualMount, expectedMount, ip) + return fmt.Errorf("actual filesystem '%v' does not match the expected filesystem '%v' for node IP '%s'", actualMount, expectedMounts, ip) } } @@ -1049,22 +1129,22 @@ func HPCCheckFileMount(t *testing.T, sClient *ssh.Client, ipsList []string, node } // Create, read, verify and delete sample files in each mount - for i := 1; i < len(expectedMount); i++ { + for i := 1; i < len(expectedMounts); i++ { // Create file - _, fileCreationErr := utils.ToCreateFileWithContent(t, sClient, expectedMount[i], SampleFileName, sampleText, logger) + _, fileCreationErr := utils.ToCreateFileWithContent(t, sClient, expectedMounts[i], SampleFileName, sampleText, logger) if fileCreationErr != nil { - return fmt.Errorf("failed to create file on %s for machine IP %s: %w", expectedMount[i], ip, fileCreationErr) + return fmt.Errorf("failed to create file on %s for machine IP %s: %w", expectedMounts[i], ip, fileCreationErr) } // Read file - actualText, fileReadErr := utils.ReadRemoteFileContents(t, sClient, expectedMount[i], SampleFileName, logger) + actualText, fileReadErr := utils.ReadRemoteFileContents(t, sClient, expectedMounts[i], SampleFileName, logger) if fileReadErr != nil { // Delete file if reading fails - _, fileDeletionErr := utils.ToDeleteFile(t, sClient, expectedMount[i], SampleFileName, logger) + _, fileDeletionErr := utils.ToDeleteFile(t, sClient, expectedMounts[i], SampleFileName, logger) if fileDeletionErr != nil { return fmt.Errorf("failed to delete %s file on machine IP %s: %w", SampleFileName, ip, fileDeletionErr) } - return fmt.Errorf("failed to read %s file content on %s machine IP %s: %w", SampleFileName, expectedMount[i], ip, fileReadErr) + return fmt.Errorf("failed to read %s file content on %s machine IP %s: %w", SampleFileName, expectedMounts[i], ip, fileReadErr) } // Verify file content @@ -1073,18 +1153,18 @@ func HPCCheckFileMount(t *testing.T, sClient *ssh.Client, ipsList []string, node } // Delete file after verification - _, fileDeletionErr := utils.ToDeleteFile(t, sClient, expectedMount[i], SampleFileName, logger) + _, fileDeletionErr := utils.ToDeleteFile(t, sClient, expectedMounts[i], SampleFileName, logger) if fileDeletionErr != nil { return fmt.Errorf("failed to delete %s file on machine IP %s: %w", SampleFileName, ip, fileDeletionErr) } } } else { // 
For login nodes, only /mnt/lsf is checked - expectedMount := "/mnt/lsf" + loginNodeMountPath := "/mnt/lsf" // Verify /mnt/lsf existence - if !utils.VerifyDataContains(t, actualMount, expectedMount, logger) { - return fmt.Errorf("actual filesystem '%v' does not match the expected filesystem '%v' for node IP '%s'", actualMount, expectedMount, ip) + if !utils.VerifyDataContains(t, actualMount, loginNodeMountPath, logger) { + return fmt.Errorf("actual filesystem '%v' does not match the expected filesystem '%v' for node IP '%s'", actualMount, loginNodeMountPath, ip) } // Log /mnt/lsf existence @@ -1119,18 +1199,15 @@ func verifyDirectories(t *testing.T, sClient *ssh.Client, ip string, logger *uti switch { case utils.IsStringInSlice(actualDirs, "openldap"): expectedDirs = []string{ - "conf", "config_done", "das_staging_area", "data", - "gui-logs", "log", "openldap", "repository-path", "work", + "das_staging_area", "data", "gui", "logs", "lsf", "openldap", "perf", "ppm", "repository-path", "ssh", } case utils.IsStringInSlice(actualDirs, "pac"): expectedDirs = []string{ - "conf", "config_done", "das_staging_area", "data", - "gui-logs", "log", "lsf_packages", "pac", "repository-path", "work", + "das_staging_area", "data", "gui", "logs", "lsf", "perf", "ppm", "repository-path", "ssh", } default: expectedDirs = []string{ - "conf", "config_done", "das_staging_area", "data", - "gui-logs", "log", "repository-path", "work", + "das_staging_area", "data", "gui", "logs", "lsf", "perf", "ppm", "repository-path", "ssh", } } @@ -1139,15 +1216,13 @@ func verifyDirectories(t *testing.T, sClient *ssh.Client, ip string, logger *uti return fmt.Errorf("actual directory '%v' does not match the expected directory '%v' for node IP '%s'", actualDirs, expectedDirs, ip) } - // Log directories existence - logger.Info(t, fmt.Sprintf("Directories [10.1, conf, config_done, das_staging_area, data, gui-conf, gui-logs, log, repository-path and work] exist on %s", ip)) return nil } -// VerifyTerraformOutputs verifies specific fields in the Terraform outputs and ensures they are not empty based on the provided LastTestTerraformOutputs. +// HPCVerifyTerraformOutputs verifies specific fields in the Terraform outputs and ensures they are not empty based on the provided LastTestTerraformOutputs. // Additional checks are performed for the application center and LDAP server based on the isAPPCenterEnabled and ldapServerEnabled flags. // Any missing essential field results in an error being returned with detailed information. -func VerifyTerraformOutputs(t *testing.T, LastTestTerraformOutputs map[string]interface{}, isAPPCenterEnabled, ldapServerEnabled bool, logger *utils.AggregatedLogger) error { +func HPCVerifyTerraformOutputs(t *testing.T, LastTestTerraformOutputs map[string]interface{}, isAPPCenterEnabled, ldapServerEnabled bool, logger *utils.AggregatedLogger) error { fields := []string{"ssh_to_management_node", "ssh_to_login_node", "vpc_name", "region_name"} actualOutput := make(map[string]interface{}) @@ -1183,57 +1258,58 @@ func VerifyTerraformOutputs(t *testing.T, LastTestTerraformOutputs map[string]in } -// LSFCheckSSHConnectivityToNodesFromLogin checks SSH connectivity from the login node to other nodes. 
-func LSFCheckSSHConnectivityToNodesFromLogin(t *testing.T, sshLoginClient *ssh.Client, managementNodeIPList, computeNodeIPList []string, logger *utils.AggregatedLogger) error { +// LSFCheckSSHConnectivityToNodesFromLogin verifies SSH connectivity from the login node +// to all other nodes in the cluster (management and compute). +func LSFCheckSSHConnectivityToNodesFromLogin(t *testing.T, sshLoginClient *ssh.Client, managementNodeIPList, computeNodeIPList []string, logger *utils.AggregatedLogger, +) error { // Check if management node IP list is empty if len(managementNodeIPList) == 0 { - return fmt.Errorf("ERROR: management node IPs cannot be empty") + return fmt.Errorf("management node IP list cannot be empty") } - // Iterate over each management node IP in the list + // Check connectivity to each management node for _, managementNodeIP := range managementNodeIPList { - // Run SSH command to get the hostname of the management node command := fmt.Sprintf("ssh %s 'hostname'", managementNodeIP) actualOutput, err := utils.RunCommandInSSHSession(sshLoginClient, command) if err != nil { - return fmt.Errorf("failed to run SSH command on management node IP %s: %w", managementNodeIP, err) + return fmt.Errorf("failed to SSH from login node to management node IP %s: %w", managementNodeIP, err) } - // Check if the hostname contains "mgmt" substring if !utils.VerifyDataContains(t, actualOutput, "mgmt", logger) { - return fmt.Errorf("compute node '%v' does not contain 'mgmt' substring for node IP '%s'", actualOutput, managementNodeIP) + return fmt.Errorf("hostname '%v' does not contain 'mgmt' for management node IP '%s'", actualOutput, managementNodeIP) } } // Check if compute node IP list is empty if len(computeNodeIPList) == 0 { - return fmt.Errorf("ERROR: compute node IPs cannot be empty") + return fmt.Errorf("compute node IP list cannot be empty") } - // Iterate over each compute node IP in the list + + // Check connectivity to each compute node for _, computeNodeIP := range computeNodeIPList { - // Run a simple SSH command to check connectivity command := fmt.Sprintf("ssh -o ConnectTimeout=12 -q %s exit", computeNodeIP) _, err := utils.RunCommandInSSHSession(sshLoginClient, command) if err != nil { - return fmt.Errorf("failed to run SSH command on compute node IP %s: %w", computeNodeIP, err) + return fmt.Errorf("failed to SSH from login node to compute node IP %s: %w", computeNodeIP, err) } - } - // Log success if no errors occurred - logger.Info(t, "SSH connectivity check from login node to other nodes completed successfully") - // No errors occurred + + // Log success + logger.Info(t, "SSH connectivity check from login node to management and compute nodes completed successfully") return nil } -// HPCCheckNoVNC checks if NO VNC is properly configured on a remote machine. +// LSFCheckNoVNC checks if NO VNC is properly configured on a remote machine. // It executes a series of commands via SSH and verifies the expected output. 
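
// A generic sketch of the check pattern that LSFCheckNoVNC (below) uses: run each command and
// require that its output contains an expected substring. The run parameter is an assumption made
// for illustration; in this repository it would typically wrap the SSH session helper.
func verifyCommandOutputs(checks map[string]string, run func(cmd string) (string, error)) error {
	for cmd, expected := range checks {
		out, err := run(cmd)
		if err != nil {
			return fmt.Errorf("failed to execute command '%s': %w", cmd, err)
		}
		if !strings.Contains(out, expected) {
			return fmt.Errorf("command '%s': expected output to contain '%s', got: %s", cmd, expected, out)
		}
	}
	return nil
}
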
-func HPCCheckNoVNC(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) error { +func LSFCheckNoVNC(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) error { // Define commands to be executed and their expected outputs commands := map[string]string{ - "rpm -qa | grep xterm": "xterm", - "rpm -qa | grep tigervnc": "tigervnc", - "ps aux | grep -i novnc": "-Ddefault.novnc.port=6080", - "netstat -tuln | grep 6080": "0.0.0.0:6080", + "rpm -qa | grep xterm": "xterm", + "rpm -qa | grep tigervnc": "tigervnc", + "ps aux | grep -i novnc": "-Ddefault.novnc.port=6080", + "netstat -tuln | grep 6080": "0.0.0.0:6080", + "curl -sI http://localhost:6080 | head -1": "200 OK", + "which vncserver || command -v vncserver": "vncserver", } // Iterate over commands @@ -1311,7 +1387,7 @@ func VerifyEncryption(t *testing.T, apiKey, region, resourceGroup, clusterPrefix } // // Retrieve the list of file shares (retry once after 2s if it fails) - fileSharesOutput, err := utils.RunCommandWithRetry(fileSharesCmd, 1, 60*time.Second) + fileSharesOutput, err := utils.RunCommandWithRetry(fileSharesCmd, 3, 60*time.Second) if err != nil { return fmt.Errorf("failed to retrieve file shares: %w", err) } @@ -1321,6 +1397,7 @@ func VerifyEncryption(t *testing.T, apiKey, region, resourceGroup, clusterPrefix for _, fileShareName := range fileShareNames { fileShareCmd := exec.Command("ibmcloud", "is", "share", fileShareName) + output, err := fileShareCmd.CombinedOutput() if err != nil { return fmt.Errorf("failed to retrieve file share details for '%s': %w", fileShareName, err) @@ -1328,13 +1405,14 @@ func VerifyEncryption(t *testing.T, apiKey, region, resourceGroup, clusterPrefix if !utils.VerifyDataContains(t, strings.ToLower(keyManagement), "key_protect", logger) { if !utils.VerifyDataContains(t, string(output), "provider_managed", logger) { - return fmt.Errorf("encryption in transit is unexpectedly enabled for the file shares ") + return fmt.Errorf("encryption-in-transit is unexpectedly enabled for the file shares") } } else { if !utils.VerifyDataContains(t, string(output), "user_managed", logger) { - return fmt.Errorf("encryption in transit is unexpectedly disabled for the file shares") + return fmt.Errorf("encryption-in-transit is unexpectedly disabled for the file shares") } } + } logger.Info(t, "Encryption set as expected") return nil @@ -1431,11 +1509,11 @@ func LSFRunJobsAsLDAPUser(t *testing.T, sClient *ssh.Client, jobCmd, ldapUser st return fmt.Errorf("job execution for ID %s exceeded the specified time", jobID) } -// HPCCheckFileMountAsLDAPUser checks if essential LSF directories (conf, config_done, das_staging_area, data, gui-conf, gui-logs, log, openldap, repository-path and work) exist +// CheckFileMountAsLDAPUser checks if essential LSF directories (conf, config_done, das_staging_area, data, gui-conf, gui-logs, log, openldap, repository-path and work) exist // on remote machines It utilizes SSH to // query and validate the directories. Any missing directory triggers an error, and the // function logs the success message if all directories are found. 
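
// A sketch of the create/read/verify/delete round trip that the mount checks in this file perform
// on each shared filesystem to prove it is writable. The file name, sample text, and run callback
// are illustrative assumptions, not the module's exact helpers.
func mountWriteReadCheck(mountPath string, run func(cmd string) (string, error)) error {
	const fileName = "mount_check_sample.txt"
	const sampleText = "Welcome to the ibm cloud HPC"
	target := fmt.Sprintf("%s/%s", mountPath, fileName)

	if _, err := run(fmt.Sprintf("echo '%s' > %s", sampleText, target)); err != nil {
		return fmt.Errorf("failed to create %s: %w", target, err)
	}
	out, err := run(fmt.Sprintf("cat %s", target))
	if err != nil {
		return fmt.Errorf("failed to read %s: %w", target, err)
	}
	if !strings.Contains(out, sampleText) {
		return fmt.Errorf("unexpected content in %s: %s", target, out)
	}
	if _, err := run(fmt.Sprintf("rm -f %s", target)); err != nil {
		return fmt.Errorf("failed to delete %s: %w", target, err)
	}
	return nil
}
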
-func HPCCheckFileMountAsLDAPUser(t *testing.T, sClient *ssh.Client, nodeType string, logger *utils.AggregatedLogger) error { +func CheckFileMountAsLDAPUser(t *testing.T, sClient *ssh.Client, nodeType string, logger *utils.AggregatedLogger) error { // Define constants const ( sampleText = "Welcome to the ibm cloud HPC" @@ -1455,10 +1533,10 @@ func HPCCheckFileMountAsLDAPUser(t *testing.T, sClient *ssh.Client, nodeType str actualMount := strings.TrimSpace(string(outputOne)) if !strings.Contains(strings.ToLower(nodeType), "login") { - expectedMount := []string{"/mnt/lsf", "/mnt/vpcstorage/tools", "/mnt/vpcstorage/data"} - for _, mount := range expectedMount { + expectedMounts := []string{"/mnt/lsf", "/mnt/vpcstorage/tools", "/mnt/vpcstorage/data"} + for _, mount := range expectedMounts { if !utils.VerifyDataContains(t, actualMount, mount, logger) { - return fmt.Errorf("actual filesystem '%v' does not match the expected filesystem '%v' for node %s", actualMount, expectedMount, hostname) + return fmt.Errorf("actual filesystem '%v' does not match the expected filesystem '%v' for node %s", actualMount, expectedMounts, hostname) } } logger.Info(t, fmt.Sprintf("Filesystems [/mnt/lsf, /mnt/vpcstorage/tools,/mnt/vpcstorage/data] exist on the node %s", hostname)) @@ -1467,34 +1545,34 @@ func HPCCheckFileMountAsLDAPUser(t *testing.T, sClient *ssh.Client, nodeType str return err } - for i := 1; i < len(expectedMount); i++ { - _, fileCreationErr := utils.ToCreateFileWithContent(t, sClient, expectedMount[i], SampleFileName, sampleText, logger) + for i := 1; i < len(expectedMounts); i++ { + _, fileCreationErr := utils.ToCreateFileWithContent(t, sClient, expectedMounts[i], SampleFileName, sampleText, logger) if fileCreationErr != nil { - return fmt.Errorf("failed to create file on %s for machine %s: %w", expectedMount[i], hostname, fileCreationErr) + return fmt.Errorf("failed to create file on %s for machine %s: %w", expectedMounts[i], hostname, fileCreationErr) } - actualText, fileReadErr := utils.ReadRemoteFileContents(t, sClient, expectedMount[i], SampleFileName, logger) + actualText, fileReadErr := utils.ReadRemoteFileContents(t, sClient, expectedMounts[i], SampleFileName, logger) if fileReadErr != nil { - _, fileDeletionErr := utils.ToDeleteFile(t, sClient, expectedMount[i], SampleFileName, logger) + _, fileDeletionErr := utils.ToDeleteFile(t, sClient, expectedMounts[i], SampleFileName, logger) if fileDeletionErr != nil { return fmt.Errorf("failed to delete %s file on machine %s: %w", SampleFileName, hostname, fileDeletionErr) } - return fmt.Errorf("failed to read %s file content on %s machine %s: %w", SampleFileName, expectedMount[i], hostname, fileReadErr) + return fmt.Errorf("failed to read %s file content on %s machine %s: %w", SampleFileName, expectedMounts[i], hostname, fileReadErr) } if !utils.VerifyDataContains(t, actualText, sampleText, logger) { return fmt.Errorf("%s actual file content '%v' does not match the file content '%v' for node %s", SampleFileName, actualText, sampleText, hostname) } - _, fileDeletionErr := utils.ToDeleteFile(t, sClient, expectedMount[i], SampleFileName, logger) + _, fileDeletionErr := utils.ToDeleteFile(t, sClient, expectedMounts[i], SampleFileName, logger) if fileDeletionErr != nil { return fmt.Errorf("failed to delete %s file on machine %s: %w", SampleFileName, hostname, fileDeletionErr) } } } else { - expectedMount := "/mnt/lsf" - if !utils.VerifyDataContains(t, actualMount, expectedMount, logger) { - return fmt.Errorf("actual filesystem '%v' does not match the 
expected filesystem '%v' for node %s", actualMount, expectedMount, hostname) + loginNodeMountPath := "/mnt/lsf" + if !utils.VerifyDataContains(t, actualMount, loginNodeMountPath, logger) { + return fmt.Errorf("actual filesystem '%v' does not match the expected filesystem '%v' for node %s", actualMount, loginNodeMountPath, hostname) } logger.Info(t, fmt.Sprintf("Filesystems /mnt/lsf exist on the node %s", hostname)) @@ -1519,15 +1597,13 @@ func verifyDirectoriesAsLdapUser(t *testing.T, sClient *ssh.Client, hostname str actualDirs := strings.Fields(strings.TrimSpace(string(outputTwo))) // Define expected directories - expectedDirs := []string{"conf", "config_done", "das_staging_area", "data", "gui-logs", "log", "openldap", "repository-path", "work"} + expectedDirs := []string{"das_staging_area", "data", "gui", "logs", "lsf", "openldap", "perf", "ppm", "repository-path", "ssh"} // Verify if all expected directories exist if !utils.VerifyDataContains(t, actualDirs, expectedDirs, logger) { return fmt.Errorf("actual directory '%v' does not match the expected directory '%v' for node IP '%s'", actualDirs, expectedDirs, hostname) } - // Log directories existence - logger.Info(t, fmt.Sprintf("Directories [10.1, conf, config_done, das_staging_area, data, gui-conf, gui-logs, log, repository-path and work] exist on %s", hostname)) return nil } @@ -1648,7 +1724,7 @@ func VerifyLDAPServerConfig(t *testing.T, sClient *ssh.Client, ldapAdminpassword if err != nil { return fmt.Errorf("failed to execute command '%s' via SSH: %v", ldapConfigCheckCmd, err) } - expected := fmt.Sprintf("BASE dc=%s,dc=%s", strings.Split(ldapDomain, ".")[0], strings.Split(ldapDomain, ".")[1]) + expected := fmt.Sprintf("BASE dc=%s,dc=%s", strings.Split(ldapDomain, ".")[0], strings.Split(ldapDomain, ".")[1]) if !utils.VerifyDataContains(t, actual, expected, logger) { return fmt.Errorf("LDAP configuration check failed: Expected '%s', got '%s'", expected, actual) } @@ -1685,9 +1761,9 @@ func VerifyLDAPServerConfig(t *testing.T, sClient *ssh.Client, ldapAdminpassword return nil } -// verifyPTRRecords verifies PTR records for 'mgmt' or 'login' nodes and ensures their resolution via SSH. +// verifyPTRRecords verifies PTR records for 'mgmt' nodes and ensures their resolution via SSH. // It retrieves hostnames, performs nslookup to verify PTR records, and returns an error if any step fails. 
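
// A sketch of a single PTR lookup of the kind verifyPTRRecords (below) performs for each
// management node: reverse-resolve the IP with nslookup and require that the expected FQDN appears
// in the answer. The run callback is assumed to execute the command on a cluster node, for example
// over SSH; it is not the module's exact helper.
func checkPTRRecord(ip, hostName, domainName string, run func(cmd string) (string, error)) error {
	expectedFQDN := fmt.Sprintf("%s.%s", hostName, domainName)
	out, err := run(fmt.Sprintf("nslookup %s", ip))
	if err != nil {
		return fmt.Errorf("nslookup %s failed: %w", ip, err)
	}
	if !strings.Contains(out, expectedFQDN) {
		return fmt.Errorf("PTR record for %s does not resolve to %s: %s", ip, expectedFQDN, out)
	}
	return nil
}
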
-func verifyPTRRecords(t *testing.T, sClient *ssh.Client, publicHostName, publicHostIP, privateHostName string, managementNodeIPList []string, loginNodeIP string, domainName string, logger *utils.AggregatedLogger) error { +func verifyPTRRecords(t *testing.T, sClient *ssh.Client, publicHostName, publicHostIP, privateHostName string, managementNodeIPList []string, domainName string, logger *utils.AggregatedLogger) error { // Slice to hold the list of hostnames var hostNamesList []string @@ -1751,25 +1827,6 @@ func verifyPTRRecords(t *testing.T, sClient *ssh.Client, publicHostName, publicH } logger.Info(t, "Verify PTR Records for management nodes completed successfully.") - // If login node IP is provided, verify PTR records on login node as well - if loginNodeIP != "" { - loginSshClient, connectionErr := utils.ConnectToHost(publicHostName, publicHostIP, privateHostName, loginNodeIP) - if connectionErr != nil { - return fmt.Errorf("failed to connect to the login node %s via SSH: %v", loginNodeIP, connectionErr) - } - - defer func() { - if err := loginSshClient.Close(); err != nil { - logger.Info(t, fmt.Sprintf("failed to close loginSshClient: %v", err)) - } - }() - - // Verify PTR records on login node - if err := verifyPTR(loginSshClient, fmt.Sprintf("login node %s", loginNodeIP)); err != nil { - return err - } - } - logger.Info(t, "Verify PTR Records for login node completed successfully.") logger.Info(t, "Verify PTR Records completed successfully.") return nil @@ -1839,6 +1896,14 @@ func DeleteServiceInstance(t *testing.T, apiKey, region, resourceGroup, instance logger.Info(t, fmt.Sprintf("Service instance '%s' retrieved successfully. Instance ID: %s", instanceName, serviceInstanceID)) + // Set the IBM Cloud Key Protect region + setKPRegionCommand := fmt.Sprintf("ibmcloud kp region-set %s", region) + setKPRegionExec := exec.Command("bash", "-c", setKPRegionCommand) + setKPRegionOutput, err := setKPRegionExec.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to set Key Protect region: %w. Output: %s", err, string(setKPRegionOutput)) + } + // Retrieve and delete associated keys getAssociatedKeysCmd := fmt.Sprintf("ibmcloud kp keys -i %s | awk 'NR>3' | awk '{print $1}'", serviceInstanceID) cmdKeysID := exec.Command("bash", "-c", getAssociatedKeysCmd) @@ -1906,11 +1971,18 @@ func CreateKey(t *testing.T, apiKey, region, resourceGroup, instanceName, keyNam logger.Info(t, fmt.Sprintf("Service instance '%s' retrieved successfully. Instance ID: %s", instanceName, serviceInstanceID)) - // Create key + // Set the IBM Cloud Key Protect region + setKPRegionCommand := fmt.Sprintf("ibmcloud kp region-set %s", region) + setKPRegionExec := exec.Command("bash", "-c", setKPRegionCommand) + setKPRegionOutput, err := setKPRegionExec.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to set Key Protect region: %w. Output: %s", err, string(setKPRegionOutput)) + } + // Create key createKeyCmd := fmt.Sprintf("ibmcloud kp key create %s -i %s", keyName, serviceInstanceID) - cmdKey := exec.Command("bash", "-c", createKeyCmd) - keyOutput, err := cmdKey.CombinedOutput() + createCmdKey := exec.Command("bash", "-c", createKeyCmd) + keyOutput, err := createCmdKey.CombinedOutput() if err != nil { return fmt.Errorf("failed to create key: %w. 
Output: %s", err, string(keyOutput)) } @@ -1994,68 +2066,85 @@ func LSFDNSCheck(t *testing.T, sClient *ssh.Client, ipsList []string, domain str return nil } -// HPCAddNewLDAPUser adds a new LDAP user by modifying an existing user's configuration and running necessary commands. -// It reads the existing LDAP user configuration, replaces the existing user information with the new LDAP user -// information, creates a new LDIF file on the LDAP server, and then runs LDAP commands to add the new user. Finally, it -// verifies the addition of the new LDAP user by searching the LDAP server. -// Returns an error if the if any step fails -func HPCAddNewLDAPUser(t *testing.T, sClient *ssh.Client, ldapAdminPassword, ldapDomain, ldapUser, newLdapUser string, logger *utils.AggregatedLogger) error { - // Define the command to read the existing LDAP user configuration - getLDAPUserConf := "cat /opt/users.ldif" - actual, err := utils.RunCommandInSSHSession(sClient, getLDAPUserConf) +// LSFAddNewLDAPUser creates a new user in LDAP via SSH connection. +// Generates password hash, prepares LDIF configuration, and executes +// remote ldapadd command. Verifies success by searching for the new user. +// Returns nil on success or error if any operation fails. +// Domain must be in "dc1.dc2" format. +func LSFAddNewLDAPUser(t *testing.T, sClient *ssh.Client, ldapAdminPassword, ldapDomain, ldapUser, newLdapUser, newLdapPassword string, logger *utils.AggregatedLogger) error { + // Step 1: Parse the LDAP domain + domainParts := strings.Split(ldapDomain, ".") + if len(domainParts) != 2 { + return fmt.Errorf("invalid LDAP domain format: %s", ldapDomain) + } + dc1, dc2 := domainParts[0], domainParts[1] + + // Step 2: Fetch existing user LDIF + getLDAPUserConf := fmt.Sprintf(`ldapsearch -x -D "cn=admin,dc=%s,dc=%s" -w '%s' -b "ou=people,dc=%s,dc=%s" "(uid=%s)" | awk '/^dn: uid=/{flag=1} /^# search result/{flag=0} flag' > newuser.ldif`, + dc1, dc2, ldapAdminPassword, dc1, dc2, ldapUser) + _, err := utils.RunCommandInSSHSession(sClient, getLDAPUserConf) if err != nil { - return fmt.Errorf("failed to execute command '%s' via SSH: %v", getLDAPUserConf, err) + return fmt.Errorf("failed to run ldapsearch: %v", err) } - // Replace the existing LDAP user name with the new LDAP user name - ldifContent := strings.ReplaceAll(actual, ldapUser, newLdapUser) + // Step 3: Read the original LDIF + originalLDIF, err := utils.RunCommandInSSHSession(sClient, "cat ./newuser.ldif") + if err != nil { + return fmt.Errorf("failed to read LDIF: %v", err) + } + if strings.TrimSpace(originalLDIF) == "" { + return fmt.Errorf("no LDIF content found for user %s", ldapUser) + } - ldifContent = strings.ReplaceAll(ldifContent, "10000", "20000") + // Step 4: Replace username and UID + updatedLDIF := strings.ReplaceAll(originalLDIF, ldapUser, newLdapUser) + updatedLDIF = strings.ReplaceAll(updatedLDIF, "uidNumber: 10000", "uidNumber: 20000") - // Create the new LDIF file on the LDAP server - _, fileCreationErr := utils.ToCreateFileWithContent(t, sClient, ".", "user2.ldif", ldifContent, logger) - if fileCreationErr != nil { - return fmt.Errorf("failed to create file on LDAP server: %w", fileCreationErr) + // Generate password hash + hashedPass, err := utils.GenerateLDAPPasswordHash(t, sClient, newLdapPassword, logger) + if err != nil { + return fmt.Errorf("password hash generation failed: %w", err) } - // Parse the LDAP domain for reuse - domainParts := strings.Split(ldapDomain, ".") - if len(domainParts) != 2 { - return fmt.Errorf("invalid LDAP domain 
format: %s", ldapDomain) + lines := strings.Split(updatedLDIF, "\n") + for i, line := range lines { + if strings.HasPrefix(line, "userPassword:") || strings.HasPrefix(line, "userPassword::") { // pragma: allowlist secret + lines[i] = "userPassword: " + hashedPass // pragma: allowlist secret + } } - dc1, dc2 := domainParts[0], domainParts[1] + updatedLDIF = strings.Join(lines, "\n") - // Define the command to add the new LDAP user using the ldapadd command - ldapAddCmd := fmt.Sprintf( - "ldapadd -x -D cn=admin,dc=%s,dc=%s -w %s -f user2.ldif", - dc1, dc2, ldapAdminPassword, - ) - ldapAddOutput, err := utils.RunCommandInSSHSession(sClient, ldapAddCmd) + // Step 6: Write the updated LDIF to file using heredoc + heredoc := fmt.Sprintf(`cat < ./user2.ldif +%s +EOF`, updatedLDIF) + _, err = utils.RunCommandInSSHSession(sClient, heredoc) if err != nil { - return fmt.Errorf("failed to execute command '%s' via SSH: %v", ldapAddCmd, err) + return fmt.Errorf("failed to write user2.ldif via heredoc: %v", err) } - // Verify the new LDAP user exists in the search results - if !utils.VerifyDataContains(t, ldapAddOutput, "uid="+newLdapUser, logger) { - return fmt.Errorf("LDAP user %s not found in add command output", newLdapUser) + // ➕ Step 7: Add the new LDAP user + ldapAddCmd := fmt.Sprintf("ldapadd -x -D cn=admin,dc=%s,dc=%s -w '%s' -f user2.ldif", dc1, dc2, ldapAdminPassword) + ldapAddOutput, err := utils.RunCommandInSSHSession(sClient, ldapAddCmd) + if err != nil { + return fmt.Errorf("ldapadd failed: %v", err) + } + if !utils.VerifyDataContains(t, ldapAddOutput, "adding new entry", logger) { + return fmt.Errorf("ldapadd did not confirm user addition: %s", ldapAddOutput) } - // Define the command to search for the new LDAP user to verify the addition - ldapSearchCmd := fmt.Sprintf( - "ldapsearch -x -D \"cn=admin,dc=%s,dc=%s\" -w %s -b \"ou=people,dc=%s,dc=%s\" -s sub \"(objectClass=*)\"", - dc1, dc2, ldapAdminPassword, dc1, dc2, - ) + // Step 8: Verify the new user + ldapSearchCmd := fmt.Sprintf(`ldapsearch -x -D "cn=admin,dc=%s,dc=%s" -w '%s' -b "ou=people,dc=%s,dc=%s" "(uid=%s)"`, + dc1, dc2, ldapAdminPassword, dc1, dc2, newLdapUser) ldapSearchOutput, err := utils.RunCommandInSSHSession(sClient, ldapSearchCmd) if err != nil { - return fmt.Errorf("failed to execute command '%s' via SSH: %v", ldapSearchCmd, err) + return fmt.Errorf("ldapsearch verification failed: %v", err) } - - // Verify the new LDAP user exists in the search results if !utils.VerifyDataContains(t, ldapSearchOutput, "uid: "+newLdapUser, logger) { return fmt.Errorf("LDAP user %s not found in search results", newLdapUser) } - logger.Info(t, fmt.Sprintf("New LDAP user %s created successfully", newLdapUser)) + logger.Info(t, fmt.Sprintf("✅ New LDAP user '%s' created successfully", newLdapUser)) return nil } @@ -2110,7 +2199,8 @@ func ValidateFlowLogs(t *testing.T, apiKey, region, resourceGroup, clusterPrefix if err := utils.LoginIntoIBMCloudUsingCLI(t, apiKey, region, resourceGroup); err != nil { return fmt.Errorf("failed to log in to IBM Cloud: %w", err) } - flowLogName := fmt.Sprintf("%s-lsf-vpc", clusterPrefix) + //flowLogName := fmt.Sprintf("%s-lsf-vpc", clusterPrefix) + flowLogName := fmt.Sprintf("%s-lsf-logs", clusterPrefix) // Fetching the flow log details retrieveFlowLogs := fmt.Sprintf("ibmcloud is flow-logs %s", flowLogName) cmdRetrieveFlowLogs := exec.Command("bash", "-c", retrieveFlowLogs) @@ -2178,177 +2268,252 @@ func GetLDAPServerCert(publicHostName, bastionIP, ldapHostName, ldapServerIP str return ldapServerCert, nil } 
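
// Two illustrative sketches of the string handling in LSFAddNewLDAPUser above: clone an existing
// user's LDIF entry for a new user, bump the uidNumber, and swap in a freshly hashed password,
// then write the result on the remote host with a quoted heredoc. Both are sketches under those
// assumptions, not the module's exact implementation.
func buildNewUserLDIF(originalLDIF, oldUser, newUser, hashedPassword string) string {
	ldif := strings.ReplaceAll(originalLDIF, oldUser, newUser)
	ldif = strings.ReplaceAll(ldif, "uidNumber: 10000", "uidNumber: 20000")
	var lines []string
	for _, line := range strings.Split(ldif, "\n") {
		if strings.HasPrefix(line, "userPassword:") { // pragma: allowlist secret
			line = "userPassword: " + hashedPassword // pragma: allowlist secret
		}
		lines = append(lines, line)
	}
	return strings.Join(lines, "\n")
}

func writeLDIFCommand(ldif string) string {
	// 'EOF' is quoted so the shell does not expand anything inside the LDIF before writing it.
	return fmt.Sprintf("cat <<'EOF' > ./user2.ldif\n%s\nEOF", ldif)
}
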
-// GetClusterInfo retrieves key cluster-related information from Terraform variables. -// It extracts the cluster ID, reservation ID, and cluster prefix from the provided test options. -// Returns the cluster ID, reservation ID, and cluster prefix as strings. -func GetClusterInfo(options *testhelper.TestOptions) (string, string, string) { - var ClusterName, reservationID, clusterPrefix string - - // Retrieve values safely with type assertion - if id, ok := options.TerraformVars["cluster_name"].(string); ok { - ClusterName = id - } - if reservation, ok := options.TerraformVars["reservation_id"].(string); ok { - reservationID = reservation - } +// // GetClusterInfo retrieves key cluster-related information from Terraform variables. +// // It extracts the cluster ID, reservation ID, and cluster prefix from the provided test options. +// // Returns the cluster ID, reservation ID, and cluster prefix as strings. +// func GetClusterInfo(options *testhelper.TestOptions) (string, string, string) { +// var ClusterName, reservationID, clusterPrefix string + +// // Retrieve values safely with type assertion +// if id, ok := options.TerraformVars["cluster_name"].(string); ok { +// ClusterName = id +// } +// if reservation, ok := options.TerraformVars["reservation_id"].(string); ok { +// reservationID = reservation +// } +// if prefix, ok := options.TerraformVars["cluster_prefix"].(string); ok { +// clusterPrefix = prefix +// } + +// return ClusterName, reservationID, clusterPrefix +// } + +// // GetClusterInfo extracts key cluster-related information from Terraform variables. +// // It returns the cluster name and cluster prefix as strings. +// func GetClusterInfo(options *testhelper.TestOptions) (clusterName string, clusterPrefix string) { +// // Retrieve the cluster name if present and of type string +// if name, ok := options.TerraformVars["cluster_name"].(string); ok { +// clusterName = name +// } + +// // Retrieve the cluster prefix if present and of type string +// if prefix, ok := options.TerraformVars["cluster_prefix"].(string); ok { +// clusterPrefix = prefix +// } + +// return +// } + +// GetClusterInfo extracts key cluster-related information from Terraform variables. +// It returns cluster prefix as strings. +func GetClusterInfo(options *testhelper.TestOptions) (clusterPrefix string) { + + // Retrieve the cluster prefix if present and of type string if prefix, ok := options.TerraformVars["cluster_prefix"].(string); ok { clusterPrefix = prefix } - return ClusterName, reservationID, clusterPrefix + return } -// SetJobCommands generates job commands customized for the specified solution type and zone. -// For 'hpc' solutions, it dynamically generates commands based on the zone. -// For other solution types, it applies predefined default commands for low and medium-memory tasks. -func SetJobCommands(solution, zone string) (string, string) { - var lowMemJobCmd, medMemJobCmd string - - // Determine the job commands based on the solution type - if strings.Contains(strings.ToLower(solution), "hpc") { - // For HPC solutions, generate job commands dynamically based on the zone - lowMemJobCmd = GetJobCommand(zone, "low") - medMemJobCmd = GetJobCommand(zone, "med") - } else { - // For non-HPC solutions, use predefined default commands - lowMemJobCmd = LSF_JOB_COMMAND_LOW_MEM - medMemJobCmd = LSF_JOB_COMMAND_MED_MEM - } +// GenerateLSFJobCommandsForMemoryTypes generates the LSF job commands for low, medium, and high memory tasks. +// It returns the predefined commands for each job type. 
+func GenerateLSFJobCommandsForMemoryTypes() (string, string, string) { + // Default job commands for low, medium, and high memory tasks + lowMemJobCmd := LSF_JOB_COMMAND_LOW_MEM + medMemJobCmd := LSF_JOB_COMMAND_MED_MEM + highMemJobCmd := LSF_JOB_COMMAND_HIGH_MEM - // Return the commands for low and medium memory jobs - return lowMemJobCmd, medMemJobCmd + // Return the commands for low, medium, and high memory jobs + return lowMemJobCmd, medMemJobCmd, highMemJobCmd } -// ValidateClusterCreation checks that the cluster was successfully created by running a consistency test. -// It logs any errors encountered and returns an error if the consistency test fails or the output is nil. -func ValidateClusterCreation(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) error { - // Run the consistency test to verify cluster creation +// VerifyClusterCreationAndConsistency validates successful cluster creation and operational +// consistency. It: +// 1. Executes a consistency test via RunTestConsistency() +// 2. Verifies non-nil output +// 3. Provides detailed, traceable errors on failure +// +// Returns nil on success, or an error with context on failure. +// All outcomes are logged through the provided logger. +func VerifyClusterCreationAndConsistency(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) error { + const op = "cluster creation and consistency check" + + // Create a local copy of the test name to prevent race conditions + testName := t.Name() + + // Execute the consistency test - ensure RunTestConsistency() is thread-safe output, err := options.RunTestConsistency() + if err != nil { - logger.Error(t, fmt.Sprintf("Error running consistency test: %v", err)) - return fmt.Errorf("error running consistency test: %v", err) + // Thread-safe logging + logger.Error(t, fmt.Sprintf("%s failed for test %s: %v", op, testName, err)) + return fmt.Errorf("%s failed for test %s: %w", op, testName, err) } - // Ensure that the output is non-nil + // Check output with thread-safe nil check if output == nil { - logger.Error(t, "Expected non-nil output, but got nil") - return fmt.Errorf("expected non-nil output, but got nil") + msg := fmt.Sprintf("%s failed for test %s: nil consistency output", op, testName) + // Thread-safe logging + logger.Error(t, msg) + return fmt.Errorf("%s: %s", op, msg) } - // Log success message - logger.Info(t, t.Name()+" Cluster created successfully") + // Thread-safe success logging + logger.Info(t, fmt.Sprintf("%s: %s passed", testName, op)) return nil } -// GetClusterIPs retrieves server IPs based on the solution type (HPC or LSF). -// It returns the bastion IP, management node IPs, login node IP, and static worker node IPs, -// and an error if the solution type is invalid or there is a problem retrieving the IPs. -func GetClusterIPs(t *testing.T, options *testhelper.TestOptions, solution string, logger *utils.AggregatedLogger) (string, []string, string, []string, error) { - var bastionIP, loginNodeIP string - var managementNodeIPList, staticWorkerNodeIPList []string - var err error +// VerifyClusterCreation checks cluster creation and operational consistency. +// It runs options.RunTest and ensures the output is not nil. +// Logs results and returns an error if validation fails. 
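
// A hedged usage sketch showing how a test might wire VerifyClusterCreationAndConsistency together
// with the IP helpers defined below; the function name and the overall flow are illustrative only.
func exampleVerifyClusterFlow(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) {
	require.NoError(t, VerifyClusterCreationAndConsistency(t, options, logger))

	bastionIP, mgmtIPs, loginIP, workerIPs, err := GetClusterIPs(t, options, logger)
	require.NoError(t, err)
	logger.Info(t, fmt.Sprintf("bastion=%s mgmt=%v login=%s workers=%v", bastionIP, mgmtIPs, loginIP, workerIPs))
}
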
+func VerifyClusterCreation(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) error { + const op = "cluster creation and consistency check" - // Retrieve server IPs based on solution type - switch { - case strings.EqualFold(solution, "hpc"): - bastionIP, managementNodeIPList, loginNodeIP, err = utils.HPCGetClusterIPs(t, options, logger) - case strings.EqualFold(solution, "lsf"): - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, err = utils.LSFGetClusterIPs(t, options, logger) - default: - return "", nil, "", nil, fmt.Errorf("invalid solution type: %s", solution) + // Create a local copy of the test name to prevent race conditions + testName := t.Name() + + // Execute the consistency test - ensure RunTest is thread-safe + output, err := options.RunTest() + if err != nil { + // Thread-safe logging + logger.Error(t, fmt.Sprintf("%s failed for test %s: %v", op, testName, err)) + return fmt.Errorf("%s failed for test %s: %w", op, testName, err) } - // Return error if any occurred while fetching server IPs + // Check output with thread-safe nil check + if output == "" { + msg := fmt.Sprintf("%s failed for test %s: no output from cluster validation test", op, testName) + logger.Error(t, msg) + return fmt.Errorf("%s: %s", op, msg) + } + + // Thread-safe success logging + logger.Info(t, fmt.Sprintf("%s: %s passed", testName, op)) + return nil +} + +// GetClusterIPs fetches all key server IPs for an LSF cluster, including bastion, management, login, and static worker nodes. +// Returns individual IPs and lists along with an error if retrieval fails. +func GetClusterIPs(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) (string, []string, string, []string, error) { + + bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, err := utils.LSFGetClusterIPs(t, options, logger) if err != nil { - return "", nil, "", nil, fmt.Errorf("error occurred while getting server IPs for solution %s: %w", solution, err) + return "", nil, "", nil, fmt.Errorf("failed to retrieve cluster IPs: %w", err) } return bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, nil } -// GetClusterIPsWithLDAP retrieves server IPs along with LDAP information based on the solution type (HPC or LSF). -// It returns the bastion IP, management node IPs, login node IP, static worker node IPs, -// LDAP server IP, and an error if the solution type is invalid or there is an issue retrieving the IPs. -func GetClusterIPsWithLDAP(t *testing.T, options *testhelper.TestOptions, solution string, logger *utils.AggregatedLogger) (string, []string, string, []string, string, error) { - var bastionIP, loginNodeIP, ldapServerIP string - var managementNodeIPList, staticWorkerNodeIPList []string - var err error - - // Retrieve server IPs with LDAP information based on solution type - switch { - case strings.EqualFold(solution, "hpc"): - bastionIP, managementNodeIPList, loginNodeIP, ldapServerIP, err = utils.HPCGetClusterIPsWithLDAP(t, options, logger) - case strings.EqualFold(solution, "lsf"): - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ldapServerIP, err = utils.LSFGetClusterIPsWithLDAP(t, options, logger) - default: - return "", nil, "", nil, "", fmt.Errorf("invalid solution type: %s", solution) +// GetDeployerIPs retrieves the IP address of the deployer node +// using the provided test options and logs the process using the given logger. +// It returns the deployer IP or an error if the retrieval fails. 
+func GetDeployerIPs(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) (string, error) { + deployerIP, err := utils.LSFGetDeployerIP(t, options, logger) + if err != nil { + return "", fmt.Errorf("failed to retrieve deployer IP: %w", err) } - // Return error if any occurred while fetching server IPs + return deployerIP, nil +} + +// GetClusterIPsWithLDAP fetches all relevant server IPs for an LSF cluster, including LDAP information. +// Returns bastion, management, login, static worker node IPs, LDAP server IP, and an error if retrieval fails. +func GetClusterIPsWithLDAP(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) ( + string, []string, string, []string, string, error) { + + bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ldapServerIP, err := + utils.LSFGetClusterIPsWithLDAP(t, options, logger) + if err != nil { - return "", nil, "", nil, "", fmt.Errorf("error occurred while getting server IPs for solution %s: %w", solution, err) + return "", nil, "", nil, "", fmt.Errorf("failed to retrieve LSF cluster IPs with LDAP: %w", err) } return bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ldapServerIP, nil } -// GetComputeNodeIPs retrieves dynamic compute node IPs based on the solution type (HPC or LSF). -// It returns a list of compute node IPs and an error if the solution type is invalid or there is a problem retrieving the IPs. -// It also appends static worker node IPs if provided in the input list. -func GetComputeNodeIPs(t *testing.T, sshClient *ssh.Client, logger *utils.AggregatedLogger, solution string, staticWorkerNodeIPList []string) ([]string, error) { - var computeNodeIPList []string - var err error +// GetComputeNodeIPs retrieves compute node IPs for an LSF environment by combining +// dynamically discovered IPs with any optional static worker node IPs. +// +// Parameters: +// - t: *testing.T for test logging context +// - sshClient: Active SSH client for node communication +// - logger: AggregatedLogger for structured logging +// - staticWorkerNodeIPList: Optional list of static worker node IPs +// +// Returns: +// - []string: Unique list of compute node IPs (dynamic + static) +// - error: Wrapped error if retrieval fails or no valid IPs are found +func GetComputeNodeIPs(t *testing.T, sshClient *ssh.Client, staticWorkerNodeIPList []string, logger *utils.AggregatedLogger) ([]string, error) { + const op = "LSF compute node IP retrieval" - // Retrieve dynamic compute node IPs based on solution type - if strings.Contains(solution, "hpc") { - computeNodeIPList, err = HPCGETDynamicComputeNodeIPs(t, sshClient, logger) - if err != nil { - logger.Error(t, fmt.Sprintf("Error retrieving dynamic compute node IPs for HPC: %v", err)) - return nil, fmt.Errorf("error retrieving dynamic compute node IPs for HPC: %w", err) - } - } else if strings.Contains(solution, "lsf") { - computeNodeIPList, err = LSFGETDynamicComputeNodeIPs(t, sshClient, logger) - if err != nil { - logger.Error(t, fmt.Sprintf("Error retrieving dynamic compute node IPs for LSF: %v", err)) - return nil, fmt.Errorf("error retrieving dynamic compute node IPs for LSF: %w", err) - } - } else { - logger.Error(t, "Invalid solution type provided. 
Expected 'hpc' or 'lsf'.") - return nil, fmt.Errorf("invalid solution type provided: %s", solution) + // Retrieve dynamic IPs from LSF environment + dynamicIPs, err := LSFGETDynamicComputeNodeIPs(t, sshClient, logger) + if err != nil { + logger.Error(t, fmt.Sprintf("%s: failed to get dynamic IPs: %v", op, err)) + return nil, fmt.Errorf("%s: %w", op, err) } - // Append static worker node IPs to the dynamic node IP list if provided - if len(staticWorkerNodeIPList) > 0 { - computeNodeIPList = append(computeNodeIPList, staticWorkerNodeIPList...) - logger.Info(t, fmt.Sprintf("Appended %d static worker node IPs", len(staticWorkerNodeIPList))) + // Combine dynamic and static IPs + allIPs := append(dynamicIPs, staticWorkerNodeIPList...) + uniqueIPs := utils.RemoveDuplicateIPs(allIPs) + + if len(uniqueIPs) == 0 { + err := fmt.Errorf("no compute node IPs found (dynamic or static)") + logger.Error(t, fmt.Sprintf("%s: %v", op, err)) + return nil, fmt.Errorf("%s: %w", op, err) } - // Log the total count of retrieved compute node IPs - logger.Info(t, fmt.Sprintf("Dynamic compute node IPs retrieved successfully. Total IPs: %d", len(computeNodeIPList))) + logger.Info(t, fmt.Sprintf("%s completed: %d dynamic + %d static => %d unique IPs", + op, + len(dynamicIPs), + len(staticWorkerNodeIPList), + len(uniqueIPs))) - return computeNodeIPList, nil + return uniqueIPs, nil } -// GetLDAPServerCredentialsInfo retrieves LDAP-related information from Terraform variables. -// It returns the expected LDAP domain, LDAP admin username,LDAP user username, and LDAP user password. -func GetLDAPServerCredentialsInfo(options *testhelper.TestOptions) (string, string, string, string) { - var expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword string +// GetValidatedLDAPCredentials retrieves and validates LDAP-related credentials +// from Terraform variables. It returns the LDAP domain, admin password, +// user name, and user password. Returns an error if any required value is missing or invalid. 
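
// A small sketch of the extraction-and-validation pattern that GetValidatedLDAPCredentials (below)
// applies to each LDAP variable: the value must exist in TerraformVars and be a non-empty string.
// Illustrative only; the real function returns all four credentials at once.
func requiredTerraformString(vars map[string]interface{}, key string) (string, error) {
	value, ok := vars[key].(string)
	if !ok || value == "" {
		return "", fmt.Errorf("missing or invalid '%s' in TerraformVars", key)
	}
	return value, nil
}
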
+func GetValidatedLDAPCredentials(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) (string, string, string, string, error) { + var ( + expectedLdapDomain string + ldapAdminPassword string // pragma: allowlist secret + ldapUserName string + ldapUserPassword string // pragma: allowlist secret + ok bool + ) - // Retrieve and type-assert values safely - if domain, ok := options.TerraformVars["ldap_basedns"].(string); ok { - expectedLdapDomain = domain + // Extract LDAP domain + if expectedLdapDomain, ok = options.TerraformVars["ldap_basedns"].(string); !ok || expectedLdapDomain == "" { + err := fmt.Errorf("missing or invalid 'ldap_basedns' in TerraformVars") + logger.Error(t, err.Error()) + return "", "", "", "", err } - if adminPassword, ok := options.TerraformVars["ldap_admin_password"].(string); ok { - ldapAdminPassword = adminPassword // pragma: allowlist secret + + // Extract admin password + if ldapAdminPassword, ok = options.TerraformVars["ldap_admin_password"].(string); !ok || ldapAdminPassword == "" { + err := fmt.Errorf("missing or invalid 'ldap_admin_password' in TerraformVars") + logger.Error(t, err.Error()) + return "", "", "", "", err } - if userName, ok := options.TerraformVars["ldap_user_name"].(string); ok { - ldapUserName = userName + + // Extract LDAP username + if ldapUserName, ok = options.TerraformVars["ldap_user_name"].(string); !ok || ldapUserName == "" { + err := fmt.Errorf("missing or invalid 'ldap_user_name' in TerraformVars") + logger.Error(t, err.Error()) + return "", "", "", "", err } - if userPassword, ok := options.TerraformVars["ldap_user_password"].(string); ok { - ldapUserPassword = userPassword // pragma: allowlist secret + + // Extract user password + if ldapUserPassword, ok = options.TerraformVars["ldap_user_password"].(string); !ok || ldapUserPassword == "" { + err := fmt.Errorf("missing or invalid 'ldap_user_password' in TerraformVars") + logger.Error(t, err.Error()) + return "", "", "", "", err } - return expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword + return expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword, nil } //*****************************LSF Logs***************************** @@ -2371,7 +2536,7 @@ func validateNodeLogFiles(t *testing.T, sClient *ssh.Client, node, sharedLogDir, fmt.Sprintf("%s/lim.log.%s", dirPath, node), fmt.Sprintf("%s/res.log.%s", dirPath, node), fmt.Sprintf("%s/pim.log.%s", dirPath, node), - fmt.Sprintf("%s/Install.log", dirPath), + //fmt.Sprintf("%s/Install.log", dirPath), } case "master": logFiles = []string{ @@ -2408,8 +2573,8 @@ func getFileModificationTime(t *testing.T, sClient *ssh.Client, sharedLogDir, ma } // Parse the output to extract modification time - modTimeStr := strings.TrimSpace(output) - modTime, err := strconv.ParseInt(modTimeStr, 10, 64) + modTimeStr := strings.TrimSpace(output) //stat -c %Y is the correct syntax on most Linux systems to get modification time in epoch seconds. + modTime, err := strconv.ParseInt(modTimeStr, 10, 64) // converts the string timestamp to an integer. if err != nil { logger.Error(t, fmt.Sprintf("Failed to parse modification time from output: %s. 
Error: %v", modTimeStr, err)) return 0, fmt.Errorf("failed to parse file modification time: %w", err) @@ -2449,7 +2614,7 @@ func LogFilesInSharedFolder(t *testing.T, sClient *ssh.Client, logger *utils.Agg return err } - sharedLogDir := "/mnt/lsf/log" + sharedLogDir := SHAREDLOGDIRPATH for _, node := range managementNodes { if err := validateNodeLogFiles(t, sClient, node, sharedLogDir, "management", logger); err != nil { return err @@ -2476,7 +2641,7 @@ func LogFilesAfterMasterReboot(t *testing.T, sClient *ssh.Client, bastionIP, man return err } - sharedLogDir := "/mnt/lsf/log" + sharedLogDir := SHAREDLOGDIRPATH datePreRestart, err := getFileModificationTime(t, sClient, sharedLogDir, masterName, logger) if err != nil { return err @@ -2544,7 +2709,7 @@ func LogFilesAfterMasterShutdown(t *testing.T, sshClient *ssh.Client, apiKey, re return fmt.Errorf("failed to get current master node name: %w", err) } - sharedLogDir := "/mnt/lsf/log" + sharedLogDir := SHAREDLOGDIRPATH // Shutdown the master node if err := shutdownMasterNode(t, sshClient, oldMasterNodeName, logger); err != nil { @@ -3056,7 +3221,7 @@ func verifyDedicatedHost(t *testing.T, apiKey, region, resourceGroup, clusterPre } // Count the number of worker nodes attached to the dedicated host - actualCount := strings.Count(strings.TrimSpace(string(output)), clusterPrefix+"-worker") + actualCount := strings.Count(strings.TrimSpace(string(output)), clusterPrefix+"-comp") logger.Info(t, fmt.Sprintf("Actual worker node count: %d, Expected: %d", actualCount, expectedWorkerNodeCount)) @@ -3088,7 +3253,7 @@ func VerifyEncryptionCRN(t *testing.T, sshClient *ssh.Client, keyManagement stri } // Command to retrieve CRN configuration - cmd := "cat /opt/ibm/lsf/conf/resource_connector/ibmcloudgen2/conf/ibmcloudgen2_templates.json" + cmd := "cat /opt/ibm/lsfsuite/lsf/conf/resource_connector/ibmcloudgen2/conf/ibmcloudgen2_templates.json" // Iterate over each management node IP in the list for _, managementNodeIP := range managementNodeIPList { @@ -3110,6 +3275,7 @@ func VerifyEncryptionCRN(t *testing.T, sshClient *ssh.Client, keyManagement stri // Determine the expected CRN format based on key management type expectedCRN := "\"crn\":\"crn:v1:bluemix:public:kms" if strings.ToLower(keyManagement) != "key_protect" { + //expectedCRN = "\"crn\":\"\"" expectedCRN = "\"crn\":\"\"" } @@ -3314,10 +3480,10 @@ func VerifyCloudLogsURLFromTerraformOutput(t *testing.T, LastTestTerraformOutput return nil } -// LSFFluentBitServiceForManagementNodes validates Fluent Bit service for management nodes. +// VerifyFluentBitServiceForManagementNodes validates Fluent Bit service for management nodes. // It connects via SSH to each management node, validates the Fluent Bit service state, and logs results. // Returns an error if the process encounters any issues during validation, or nil if successful. 
-func LSFFluentBitServiceForManagementNodes(t *testing.T, sshClient *ssh.Client, managementNodeIPs []string, isCloudLogsManagementEnabled bool, logger *utils.AggregatedLogger) error { +func VerifyFluentBitServiceForManagementNodes(t *testing.T, sshClient *ssh.Client, managementNodeIPs []string, isCloudLogsManagementEnabled bool, logger *utils.AggregatedLogger) error { // Ensure management node IPs are provided if cloud logs are enabled if isCloudLogsManagementEnabled { @@ -3332,19 +3498,20 @@ func LSFFluentBitServiceForManagementNodes(t *testing.T, sshClient *ssh.Client, return fmt.Errorf("failed Fluent Bit service verification for management node %s: %w", managementIP, err) } } + } else { + logger.Warn(t, "Cloud logging is disabled for management nodes - skipping Fluent Bit validation") } return nil } -// LSFFluentBitServiceForComputeNodes initiates the process of validating Fluent Bit service +// VerifyFluentBitServiceForComputeNodes initiates the process of validating Fluent Bit service // on all compute nodes in a cluster. If cloud logging is enabled, it checks the service // status for each compute node. It returns an error if any node fails the verification. // Returns an error if the process encounters any issues during validation, or nil if successful. -func LSFFluentBitServiceForComputeNodes( +func VerifyFluentBitServiceForComputeNodes( t *testing.T, sshClient *ssh.Client, - expectedSolution string, staticWorkerNodeIPs []string, isCloudLogsComputeEnabled bool, logger *utils.AggregatedLogger) error { @@ -3356,7 +3523,7 @@ func LSFFluentBitServiceForComputeNodes( } // Retrieve compute node IPs from the worker nodes - computeNodeIPs, err := GetComputeNodeIPs(t, sshClient, logger, expectedSolution, staticWorkerNodeIPs) + computeNodeIPs, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger) if err != nil || len(computeNodeIPs) == 0 { return fmt.Errorf("failed to retrieve compute node IPs: %w", err) } @@ -3368,6 +3535,8 @@ func LSFFluentBitServiceForComputeNodes( return fmt.Errorf("failed Fluent Bit service verification for compute node %s: %w", computeIP, err) } } + } else { + logger.Warn(t, "Cloud logging is disabled for compute nodes - skipping Fluent Bit validation") } return nil } @@ -3468,6 +3637,29 @@ func CheckPlatformLogsPresent(t *testing.T, apiKey, region, resourceGroup string return true, nil } +// VerifyPlatformStatus checks if platform logs are enabled in the specified region and logs the outcome. +// It returns an error if there is an issue fetching the platform log status or if the verification fails. +func VerifyPlatformStatus(t *testing.T, apiKey, region, resourceGroup string, isPlatformLogsEnabled bool, logger *utils.AggregatedLogger) error { + + if isPlatformLogsEnabled { + isPlatformLogEnabled, err := CheckPlatformLogsPresent(t, apiKey, region, resourceGroup, logger) + if err != nil { + return fmt.Errorf("failed to check platform logs presence: %w", err) + } + + if !utils.VerifyDataContains(t, isPlatformLogEnabled, isPlatformLogsEnabled, logger) { + logger.Error(t, fmt.Sprintf("Platform logs status mismatch for region '%s'. 
Expected: %v, Got: %v.", region, isPlatformLogsEnabled, isPlatformLogEnabled)) + return fmt.Errorf("platform logs status mismatch for region '%s': expected %v, got %v", region, isPlatformLogsEnabled, isPlatformLogEnabled) + } + + logger.Info(t, fmt.Sprintf("Platform logs found for region '%s' as expected.", region)) + } else { + logger.Warn(t, "Platform logging is disabled - skipping validation") + } + + return nil +} + // VerifyCloudMonitoringURLFromTerraformOutput validates the cloud log URL in Terraform outputs. // It checks required fields in the Terraform output map and ensures the cloud logs URL // is present when cloud logging is enabled for either management or compute nodes. @@ -3527,13 +3719,18 @@ func LSFPrometheusAndDragentServiceForManagementNodes(t *testing.T, sshClient *s return fmt.Errorf("failed Prometheus service verification for management node %s: %w", managementIP, err) } + err = VerifyLSFPrometheusExportServiceForNode(t, sshClient, managementIP, logger) + if err != nil { + return fmt.Errorf("failed Prometheus export service verification for management node %s: %w", managementIP, err) + } + err = VerifyLSFdragentServiceForNode(t, sshClient, managementIP, logger) if err != nil { return fmt.Errorf("failed dragent service verification for management node %s: %w", managementIP, err) } } } else { - logger.Warn(t, "Cloud monitoring are not enabled for the management node. As a result, the Prometheus and Fluent dragent service will not be validated.") + logger.Warn(t, "Cloud monitoring is disabled for management nodes - skipping validation of Prometheus, Prometheus_Exporter, and Dragent service agents.") } return nil @@ -3546,7 +3743,6 @@ func LSFPrometheusAndDragentServiceForManagementNodes(t *testing.T, sshClient *s func LSFDragentServiceForComputeNodes( t *testing.T, sshClient *ssh.Client, - expectedSolution string, staticWorkerNodeIPs []string, isCloudMonitoringEnabledForCompute bool, logger *utils.AggregatedLogger) error { @@ -3558,7 +3754,7 @@ func LSFDragentServiceForComputeNodes( } // Retrieve compute node IPs from the worker nodes - computeNodeIPs, err := GetComputeNodeIPs(t, sshClient, logger, expectedSolution, staticWorkerNodeIPs) + computeNodeIPs, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger) if err != nil || len(computeNodeIPs) == 0 { return fmt.Errorf("failed to retrieve compute node IPs: %w", err) } @@ -3573,7 +3769,7 @@ func LSFDragentServiceForComputeNodes( } } else { - logger.Warn(t, "Cloud monitoring are not enabled for the compute node. As a result, the dragent service will not be validated.") + logger.Warn(t, "Cloud monitoring is disabled for compute nodes - skipping validation of Dragent service agent.") } return nil } @@ -3594,7 +3790,7 @@ func VerifyLSFPrometheusServiceForNode( return fmt.Errorf("failed to execute command '%s' on node %s: %w", command, nodeIP, err) } - // Expected Fluent Bit service state should be "active (running)" + // Expected prometheus service state should be "active (running)" expectedState := "Active: active (running)" // Verify if the service is in the expected running state @@ -3606,11 +3802,46 @@ func VerifyLSFPrometheusServiceForNode( ) } - // Log success if Fluent Bit service is running as expected + // Log success if prometheus service is running as expected logger.Info(t, fmt.Sprintf("Prometheus service validation passed for node %s", nodeIP)) return nil } +// VerifyLSFPrometheusExportServiceForNode checks the status of the Prometheus export service on a given node. 
+// It ensures the service is running and returns an error if its state does not match "active (running)." +func VerifyLSFPrometheusExportServiceForNode( + t *testing.T, + sshClient *ssh.Client, + nodeIP string, + logger *utils.AggregatedLogger) error { + + // Command to check the status of Prometheus export service on the node + command := fmt.Sprintf("ssh %s systemctl status lsf_prometheus_exporter", nodeIP) + output, err := utils.RunCommandInSSHSession(sshClient, command) + if err != nil { + // Return an error if the command fails to execute + return fmt.Errorf("failed to execute command '%s' on node %s: %w", command, nodeIP, err) + } + + // Expected lsf prometheus exporter service state should be "active (running)" + expectedState := "Active: active (running)" + + // Verify if the lsf prometheus exporter service is in the expected running state + if !utils.VerifyDataContains(t, output, expectedState, logger) { + // If the service state does not match the expected state, return an error with output + return fmt.Errorf( + "unexpected Prometheus export state for node %s: expected '%s', got:\n%s", + nodeIP, expectedState, output, + ) + } + + // Log success if lsf prometheus exporter service is running as expected + logger.Info(t, fmt.Sprintf("Prometheus export service validation passed for node %s", nodeIP)) + return nil +} + +//systemctl status lsf_prometheus_exporter + // VerifyLSFDragentServiceForNode checks the status of the Dragent service on a given node. // It ensures the service is running and returns an error if its state does not match "active (running)." func VerifyLSFdragentServiceForNode( @@ -3668,7 +3899,7 @@ func ValidateDynamicWorkerProfile(t *testing.T, apiKey, region, resourceGroup, c } // Fetch the dynamic worker node profile - dynamicWorkerProfileCmd := fmt.Sprintf("ibmcloud is instances | grep %s | awk '/-compute-/ && !/-worker-|-login-|-mgmt-|-bastion-/ {print $6; exit}'", clusterPrefix) + dynamicWorkerProfileCmd := fmt.Sprintf("ibmcloud is instances | grep %s | awk '!/-comp-|-login-|-mgmt-|-bastion-|-deployer-/ {print $6; exit}'", clusterPrefix) cmd = exec.Command("bash", "-c", dynamicWorkerProfileCmd) dynamicWorkerProfile, err := cmd.CombinedOutput() if err != nil { @@ -3816,3 +4047,259 @@ func ValidateAtrackerRouteTarget(t *testing.T, apiKey, region, resourceGroup, cl return nil } + +// LSFCheckSSHConnectivityToNodesFromManagement verifies SSH connectivity from the primary management node +// to other nodes in the cluster, including secondary management nodes, compute nodes, and the login node. 
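+//
+// Hedged usage sketch (illustrative only; assumes sshClient is already connected to the primary
+// management node and the IP slices come from GetClusterIPs):
+//
+//	if err := LSFCheckSSHConnectivityToNodesFromManagement(t, sshClient, managementNodeIPs, computeNodeIPs, loginNodeIP, logger); err != nil {
+//		t.Errorf("intra-cluster SSH connectivity check failed: %v", err)
+//	}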
+func LSFCheckSSHConnectivityToNodesFromManagement( + t *testing.T, + sshClient *ssh.Client, + managementNodeIPList, computeNodeIPList []string, + loginNodeIP string, + logger *utils.AggregatedLogger, +) error { + + // Check if management node IP list has at least two entries (primary + at least one secondary) + if len(managementNodeIPList) <= 1 { + return fmt.Errorf("not enough management node IPs to process after removing the first entry") + } + + // Iterate over each secondary management node IP (skipping the first, assuming it's the primary) + for _, managementNodeIP := range managementNodeIPList[1:] { + command := fmt.Sprintf("ssh %s 'hostname'", managementNodeIP) + actualOutput, err := utils.RunCommandInSSHSession(sshClient, command) + if err != nil { + return fmt.Errorf("failed to run SSH command on management node IP %s: %w", managementNodeIP, err) + } + + if !utils.VerifyDataContains(t, actualOutput, "mgmt", logger) { + return fmt.Errorf("management node hostname '%v' does not contain 'mgmt' substring for node IP '%s'", actualOutput, managementNodeIP) + } + } + + // Check if compute node IP list is empty + if len(computeNodeIPList) == 0 { + return fmt.Errorf("ERROR: compute node IP list cannot be empty") + } + + // Iterate over each compute node IP + for _, computeNodeIP := range computeNodeIPList { + command := fmt.Sprintf("ssh -o ConnectTimeout=12 -q %s exit", computeNodeIP) + _, err := utils.RunCommandInSSHSession(sshClient, command) + if err != nil { + return fmt.Errorf("failed to run SSH command on compute node IP %s: %w", computeNodeIP, err) + } + } + + // Check SSH connectivity to the login node + loginCommand := fmt.Sprintf("ssh -o ConnectTimeout=12 -q %s exit", loginNodeIP) + _, err := utils.RunCommandInSSHSession(sshClient, loginCommand) + if err != nil { + return fmt.Errorf("failed to run SSH command on login node IP %s: %w", loginNodeIP, err) + } + + // Log success + logger.Info(t, "SSH connectivity check from the primary management node to all other nodes completed successfully") + return nil +} + +// CheckLSFHosts runs 'bhosts -w' command via SSH to get the LSF host status. +// It logs the output for debugging and returns any command execution errors. +func CheckLSFHosts(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) error { + statusCmd := "bhosts -w" + output, err := utils.RunCommandInSSHSession(sClient, statusCmd) + if err != nil { + return fmt.Errorf("failed to run '%s': %w", statusCmd, err) + } + + logger.DEBUG(t, fmt.Sprintf("'bhosts -w' output:\n%s", string(output))) + return nil +} + +// ValidateLSFConfig verifies LSF configuration health by running 'lsadmin ckconfig -v'. +// It checks for a success message in the output and logs it for debugging purposes. +func ValidateLSFConfig(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) error { + expectedMessage := "No errors found." 
+ statusCmd := "sudo su -l root -c 'lsadmin ckconfig -v'" + + output, err := utils.RunCommandInSSHSession(sClient, statusCmd) + if err != nil { + return fmt.Errorf("failed to run '%s': %w", statusCmd, err) + } + + // Trim whitespace and check for empty output + trimmedOutput := strings.TrimSpace(string(output)) + if trimmedOutput == "" { + return fmt.Errorf("LSF health check failed: command returned empty output") + } + + logger.DEBUG(t, fmt.Sprintf("lsadmin ckconfig -v output:\n%s", trimmedOutput)) + + if !utils.VerifyDataContains(t, trimmedOutput, expectedMessage, logger) { + return fmt.Errorf("LSF health check failed: expected message '%s' not found in output:\n%s", + expectedMessage, trimmedOutput) + } + + return nil +} + +// LSFHealthCheck verifies if the LSF daemon (lsfd) is running and healthy on the target host. +func LSFHealthCheck(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) error { + const expectedMessage = "Active: active (running)" + + // Define the command to check lsfd status + statusCmd := "sudo su -l root -c 'systemctl status lsfd'" + + // Run the systemctl command on the remote host + output, err := utils.RunCommandInSSHSession(sClient, statusCmd) + if err != nil { + return fmt.Errorf("failed to run '%s': %w", statusCmd, err) + } + + // Check if the output contains the expected active message + if !utils.VerifyDataContains(t, string(output), expectedMessage, logger) { + return fmt.Errorf("LSF health check failed: expected message '%s' not found in output:\n%s", expectedMessage, string(output)) + } + + return nil +} + +// ValidateTerraformOutput connects to the LSF deployer node via SSH, +// fetches Terraform outputs, and validates: +// - cloud_logs_url (if cloud logging is enabled) +// - cloud_monitoring_url (if cloud monitoring is enabled) +// - ssh_to_ldap_node (if LDAP is enabled) +// - application_center_url, ssh_to_deployer, ssh_to_management_node +func ValidateTerraformOutput( + t *testing.T, + bastionIP, deployerIP string, + isCloudLogEnabled, isCloudMonitoringEnabled, isldapServerEnabled bool, + logger *utils.AggregatedLogger, +) error { + + // Establish SSH connection to the deployer node + sDeployerClient, err := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_DEPLOYER_HOST_NAME, deployerIP) + require.NoError(t, err, "Failed to connect to the deployer node via SSH") + + defer func() { + if cerr := sDeployerClient.Close(); cerr != nil { + logger.Info(t, fmt.Sprintf("Failed to close SSH client: %v", cerr)) + } + }() + + // Run terraform output command + cmd := "cd /opt/ibm/terraform-ibm-hpc && terraform output" + output, err := utils.RunCommandInSSHSession(sDeployerClient, cmd) + if err != nil { + return fmt.Errorf("failed to run '%s': %w", cmd, err) + } + logger.DEBUG(t, "Terraform OUTPUT:\n"+string(output)) + + lines := strings.Split(string(output), "\n") + + // Initialize validation flags outside the loop + isCloudvalidated := false + isCloudMonitoringvalidated := false + isldapServervalidated := false + + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + // Validate cloud_logs_url + if isCloudLogEnabled && strings.Contains(line, "cloud_logs_url") { + url := utils.ExtractTerraformValue(line) + logger.DEBUG(t, fmt.Sprintf("'cloud_logs_url' output: %s", url)) + + curlCmd := fmt.Sprintf("curl -I '%s'", url) + actualOutput, err := utils.RunCommandInSSHSession(sDeployerClient, curlCmd) + if err != nil || !utils.VerifyDataContains(t, string(actualOutput), "200", logger) { + return 
fmt.Errorf("cloud_logs_url validation failed. Output: %s", actualOutput) + } + isCloudvalidated = true + logger.Info(t, "✅ cloud_logs_url validated successfully.") + } + + // Validate cloud_monitoring_url + if isCloudMonitoringEnabled && strings.Contains(line, "cloud_monitoring_url") { + url := utils.ExtractTerraformValue(line) + logger.DEBUG(t, fmt.Sprintf("'cloud_monitoring_url' output: %s", url)) + + expectedPrefix := "https://cloud.ibm.com/observe/embedded-view/monitoring/" + if !strings.HasPrefix(url, expectedPrefix) { + return fmt.Errorf("cloud_monitoring_url mismatch. Output: %s, Expected prefix: %s", url, expectedPrefix) + } + isCloudMonitoringvalidated = true + logger.Info(t, "✅ cloud_monitoring_url validated successfully.") + } + + // Validate ssh_to_ldap_node + if isldapServerEnabled && strings.Contains(line, "ssh_to_ldap_node") { + url := utils.ExtractTerraformValue(line) + logger.DEBUG(t, fmt.Sprintf("'ssh_to_ldap_node' output: %s", url)) + + expectedPrefix := "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o" + if !strings.HasPrefix(url, expectedPrefix) { + return fmt.Errorf("ssh_to_ldap_node mismatch. Output: %s, Expected prefix: %s", url, expectedPrefix) + } + isldapServervalidated = true + logger.Info(t, "✅ ssh_to_ldap_node validated successfully.") + } + + // Validate application_center_tunnel + if strings.Contains(line, "application_center_tunnel") { + url := utils.ExtractTerraformValue(line) + logger.DEBUG(t, fmt.Sprintf("'application_center_tunnel' output: %s", url)) + + expected := `ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -L 8443` + if !utils.VerifyDataContains(t, line, expected, logger) { + return fmt.Errorf("application_center_tunnel string missing or incorrect in terraform output") + } + logger.Info(t, "✅ application_center_tunnel validated successfully.") + } + + // Validate application_center_url + if strings.Contains(line, "application_center_url") { + url := utils.ExtractTerraformValue(line) + logger.DEBUG(t, fmt.Sprintf("'application_center_url' output: %s", url)) + + expected := `https://localhost:8443` + if !utils.VerifyDataContains(t, line, expected, logger) { + return fmt.Errorf("application_center_url string missing or incorrect in terraform output") + } + logger.Info(t, "✅ application_center_url validated successfully.") + } + + // Validate ssh_to_deployer + if strings.Contains(line, "ssh_to_deployer") { + expected := `ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J` + if !utils.VerifyDataContains(t, line, expected, logger) { + return fmt.Errorf("ssh_to_deployer string missing or incorrect in terraform output") + } + logger.Info(t, "✅ ssh_to_deployer validated successfully.") + } + + // Validate ssh_to_management_node + if strings.Contains(line, "ssh_to_management_node") { + expected := `ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -J` + if !utils.VerifyDataContains(t, line, expected, logger) { + return fmt.Errorf("ssh_to_management_node string missing or incorrect in terraform output") + } + logger.Info(t, "✅ ssh_to_management_node validated successfully.") + } + } + + // Final validation checks to ensure expected outputs were found + if isCloudLogEnabled && !isCloudvalidated { + return fmt.Errorf("cloud_logs_url not found in terraform output") + } + if isCloudMonitoringEnabled && !isCloudMonitoringvalidated { + return fmt.Errorf("cloud_monitoring_url not found in terraform output") + } + if isldapServerEnabled && 
!isldapServervalidated { + return fmt.Errorf("ssh_to_ldap_node not found in terraform output") + } + + return nil +} diff --git a/tests/lsf/cluster_validation.go b/tests/lsf/cluster_validation.go index 87966088..55b99220 100644 --- a/tests/lsf/cluster_validation.go +++ b/tests/lsf/cluster_validation.go @@ -1,158 +1,163 @@ package tests import ( + "encoding/json" "fmt" "os" "strconv" - "strings" "testing" "github.com/stretchr/testify/require" "github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper/testhelper" utils "github.com/terraform-ibm-modules/terraform-ibm-hpc/utilities" + "golang.org/x/crypto/ssh" ) -// ValidateClusterConfiguration performs comprehensive validation on the cluster setup. -// It connects to various cluster components via SSH and verifies their configurations and functionality. -// This includes the following validations: -// - Management Node: Verifies the configuration of the management node, including failover and failback procedures. -// - Compute Nodes: Ensures proper configuration and SSH connectivity to compute nodes. -// - Login Node: Validates the configuration and SSH connectivity to the login node. -// - Dynamic Compute Nodes: Verifies the proper setup and functionality of dynamic compute nodes. -// Additionally, this function logs detailed information throughout the validation process. -// This function doesn't return any value but logs errors and validation steps during the process. -func ValidateClusterConfiguration(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) +type ExpectedClusterConfig struct { + MasterName string + ResourceGroup string + KeyManagement string + Zones string + NumOfKeys int + DnsDomainName string + Hyperthreading bool + LsfVersion string +} - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") +// GetExpectedClusterConfig retrieves and structures the expected cluster +// configuration from Terraform output variables. 
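+//
+// Hedged usage sketch (illustrative only; field names match the ExpectedClusterConfig struct above):
+//
+//	expected := GetExpectedClusterConfig(t, options)
+//	logger.Info(t, fmt.Sprintf("expecting cluster prefix %s in resource group %s", expected.MasterName, expected.ResourceGroup))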
+func GetExpectedClusterConfig(t *testing.T, options *testhelper.TestOptions) ExpectedClusterConfig { + masterName := utils.GetStringVarWithDefault(options.TerraformVars, "cluster_prefix", "") + resourceGroup := utils.GetStringVarWithDefault(options.TerraformVars, "existing_resource_group", "") + keyManagement := utils.GetStringVarWithDefault(options.TerraformVars, "key_management", "null") + lsfVersion := utils.GetStringVarWithDefault(options.TerraformVars, "lsf_version", "") + + zone := options.TerraformVars["zones"].([]string)[0] + numKeys := len(options.TerraformVars["ssh_keys"].([]string)) + + dnsJSON := options.TerraformVars["dns_domain_name"].(string) + var dnsMap map[string]string + require.NoError(t, json.Unmarshal([]byte(dnsJSON), &dnsMap), "Failed to unmarshal dns_domain_name") + + hyperthreading, err := strconv.ParseBool(options.TerraformVars["enable_hyperthreading"].(string)) + require.NoError(t, err, "Failed to parse enable_hyperthreading from Terraform vars") + + return ExpectedClusterConfig{ + MasterName: masterName, + ResourceGroup: resourceGroup, + KeyManagement: keyManagement, + Zones: zone, + NumOfKeys: numKeys, + DnsDomainName: dnsMap["compute"], + Hyperthreading: hyperthreading, + LsfVersion: lsfVersion, } +} - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) +// runClusterValidationsOnManagementNode performs a series of validation +// checks on the management nodes of the LSF cluster. This includes +// verifying configuration, SSH keys, DNS, failover, and daemon restarts. +func runClusterValidationsOnManagementNode(t *testing.T, sshClient *ssh.Client, bastionIP string, managementNodeIPs []string, expected ExpectedClusterConfig, jobCmd string, logger *utils.AggregatedLogger) { - // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, "Running management node and App Center validations sequentially...") - // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + // Verify management node configuration + VerifyManagementNodeConfig(t, sshClient, expected.MasterName, expected.Hyperthreading, managementNodeIPs, expected.LsfVersion, logger) - defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) - } - }() + // Run job + VerifyJobs(t, sshClient, jobCmd, logger) - testLogger.Info(t, "SSH connection to the master successful") - t.Log("Validation in progress. 
Please wait...") + // Verify application center configuration + VerifyAPPCenterConfig(t, sshClient, logger) - // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + // Verify noVNC configuration + //VerifyNoVNCConfig(t, sshClient, logger) // Verify SSH key on management nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPs, expected.NumOfKeys, logger) // Verify LSF DNS on management nodes - VerifyLSFDNS(t, sshClient, managementNodeIPList, expectedDnsDomainName, testLogger) + VerifyLSFDNS(t, sshClient, managementNodeIPs, expected.DnsDomainName, logger) // Perform failover and failback - FailoverAndFailback(t, sshClient, jobCommandMed, testLogger) + FailoverAndFailback(t, sshClient, jobCmd, logger) // Restart LSF daemon - RestartLsfDaemon(t, sshClient, testLogger) + RestartLsfDaemon(t, sshClient, logger) // Reboot instance - RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], testLogger) + RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0], logger) - // Reconnect to the management node after reboot - sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH") + logger.Info(t, "Management node and App Center validations completed.") +} - defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) - } - }() +// runClusterValidationsOnComputeNode executes validation steps specific +// to the compute nodes in the LSF cluster. This includes running jobs, +// verifying node configuration, SSH keys, and DNS settings. 
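+//
+// Hedged usage sketch (illustrative only; mirrors how ValidateClusterConfiguration wires this helper):
+//
+//	runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger)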
+func runClusterValidationsOnComputeNode(t *testing.T, sshClient *ssh.Client, bastionIP string, staticWorkerNodeIPs []string, expected ExpectedClusterConfig, jobCmd string, logger *utils.AggregatedLogger) { - // Wait for dynamic node disappearance and handle potential errors - defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { - t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) - } - }() + logger.Info(t, "Running compute node validations sequentially...") // Run job - VerifyJobs(t, sshClient, jobCommandLow, testLogger) + VerifyJobs(t, sshClient, jobCmd, logger) // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) + computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger) if err != nil { t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) } // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + VerifyComputeNodeConfig(t, sshClient, expected.Hyperthreading, computeNodeIPList, logger) // Verify SSH key on compute nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expected.NumOfKeys, logger) // Verify LSF DNS on compute nodes - VerifyLSFDNS(t, sshClient, computeNodeIPList, expectedDnsDomainName, testLogger) + VerifyLSFDNS(t, sshClient, computeNodeIPList, expected.DnsDomainName, logger) - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") + logger.Info(t, "Compute node validations completed.") +} + +// runClusterValidationsOnLoginNode conducts validations on the LSF login +// node, including verifying its configuration and SSH connectivity to +// management and compute nodes. 
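+//
+// Hedged usage sketch (illustrative only; the login node is reached through the bastion, so only IPs are passed in):
+//
+//	runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger)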
+func runClusterValidationsOnLoginNode(t *testing.T, bastionIP, loginNodeIP string, expected ExpectedClusterConfig, managementNodeIPs, computeNodeIPs []string, jobCmd string, logger *utils.AggregatedLogger) { + + logger.Info(t, "Running login node validations sequentially...") + + // Connect to the login node via SSH and handle connection errors + loginSSHClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to login node via bastion (%s) -> private IP (%s): %v", bastionIP, loginNodeIP, connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) + if err := loginSSHClient.Close(); err != nil { + logger.Info(t, fmt.Sprintf("Failed to close SSH connection: %v", err)) } }() // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + VerifyLoginNodeConfig(t, loginSSHClient, expected.MasterName, expected.Hyperthreading, loginNodeIP, jobCmd, expected.LsfVersion, logger) - // Verify SSH connectivity from login node - VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) - - // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) - - // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + // Get compute node IPs and handle errors + computeNodeIPs, connectionErr = GetComputeNodeIPs(t, loginSSHClient, computeNodeIPs, logger) + if connectionErr != nil { + t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", connectionErr) + } - // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + // Verify SSH connectivity from login node + VerifySSHConnectivityToNodesFromLogin(t, loginSSHClient, managementNodeIPs, computeNodeIPs, logger) - // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, "Login node validations completed.") } -// ValidateClusterConfigurationWithAPPCenter performs validation tasks on the cluster configuration -// with additional verification for an application center and noVNC configurations. -// It extends the validation performed by ValidateClusterConfiguration to include checks for these additional components. -// This function connects to various cluster components via SSH and verifies their configurations and functionality. -// It includes the following validations: +// ValidateClusterConfiguration performs comprehensive validation on the cluster setup. +// It connects to various cluster components via SSH and verifies their configurations and functionality. +// This includes the following validations: // - Management Node: Verifies the configuration of the management node, including failover and failback procedures. // - Compute Nodes: Ensures proper configuration and SSH connectivity to compute nodes. 
// - Login Node: Validates the configuration and SSH connectivity to the login node. @@ -161,137 +166,79 @@ func ValidateClusterConfiguration(t *testing.T, options *testhelper.TestOptions, // - noVNC: Verifies the noVNC configuration. // Additionally, this function logs detailed information throughout the validation process. // This function doesn't return any value but logs errors and validation steps during the process. -func ValidateClusterConfigurationWithAPPCenter(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") +func ValidateClusterConfiguration(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + deployerIP, getdeployerIPErr := GetDeployerIPs(t, options, logger) + require.NoError(t, getdeployerIPErr, "Failed to get deployer IP from Terraform outputs - check deployer configuration") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") + + VerifyTestTerraformOutputs(t, bastionIP, deployerIP, false, false, false, logger) // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := 
utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") - // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) - - // Verify application center configuration - VerifyAPPCenterConfig(t, sshClient, testLogger) - - // Verify noVNC configuration - VerifyNoVNCConfig(t, sshClient, testLogger) - - // Verify SSH key on management nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) - - // Verify LSF DNS on management nodes - VerifyLSFDNS(t, sshClient, managementNodeIPList, expectedDnsDomainName, testLogger) - - // Perform failover and failback - FailoverAndFailback(t, sshClient, jobCommandMed, testLogger) - - // Restart LSF daemon - RestartLsfDaemon(t, sshClient, testLogger) - - // Reboot instance - RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], testLogger) + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) // Reconnect to the management node after reboot - sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH") + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) - defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) - } - }() + logger.FAIL(t, msg) + require.FailNow(t, msg) + } // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Run job - VerifyJobs(t, sshClient, jobCommandLow, testLogger) - - // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH key on 
compute nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) - - // Verify LSF DNS on compute nodes - VerifyLSFDNS(t, sshClient, computeNodeIPList, expectedDnsDomainName, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) - - // Verify SSH connectivity from login node - VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expected.DnsDomainName, logger) // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // ValidateClusterConfigurationWithPACHA performs validation tasks on the cluster configuration @@ -307,159 +254,84 @@ func ValidateClusterConfigurationWithAPPCenter(t *testing.T, options *testhelper // - noVNC: Verifies the noVNC configuration. // Additionally, this function logs detailed information throughout the validation process. // This function doesn't return any value but logs errors and validation steps during the process. 
-func ValidateClusterConfigurationWithPACHA(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) - - // Run the test consistency check - output, err := options.RunTestConsistency() - if err != nil { - testLogger.Error(t, fmt.Sprintf("Error running consistency test: %v", err)) - require.NoError(t, err, "error running consistency test: %v", err) - } +func ValidateClusterConfigurationWithPACHA(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - // Ensure that the output is non-nil - if output == nil { - testLogger.Error(t, "Expected non-nil output, but got nil") - require.NotNil(t, output, "expected non-nil output, but got nil") - } + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - outputErr := ValidateTerraformPACOutputs(t, options.LastTestTerraformOutputs, expectedDnsDomainName, testLogger) - require.NoError(t, outputErr, "Error occurred while out server IPs: %v", outputErr) + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Log success message - testLogger.Info(t, t.Name()+" Cluster created successfully") + deployerIP, getdeployerIPErr := GetDeployerIPs(t, options, logger) + require.NoError(t, getdeployerIPErr, "Failed to get deployer IP from Terraform outputs - check deployer configuration") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") + + VerifyTestTerraformOutputs(t, bastionIP, deployerIP, false, false, false, logger) // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - 
require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") - // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) - - // Verify application center configuration - VerifyAPPCenterConfig(t, sshClient, testLogger) - - // Verify PACHA configuration by validating the application center setup. - ValidatePACHAOnManagementNodes(t, sshClient, expectedDnsDomainName, bastionIP, managementNodeIPList, testLogger) - - // Verify noVNC configuration - VerifyNoVNCConfig(t, sshClient, testLogger) - - // Verify SSH key on management nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) - - // Verify LSF DNS on management nodes - VerifyLSFDNS(t, sshClient, managementNodeIPList, expectedDnsDomainName, testLogger) - - // Perform failover and failback - FailoverAndFailback(t, sshClient, jobCommandMed, testLogger) - - // Restart LSF daemon - RestartLsfDaemon(t, sshClient, testLogger) - - // Reboot instance - RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], testLogger) + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) // Reconnect to the management node after reboot - sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH") - - defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) - } - }() + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() // Verify PACHA configuration by validating the application center setup. 
- ValidatePACHAOnManagementNodes(t, sshClient, expectedDnsDomainName, bastionIP, managementNodeIPList, testLogger) - - // Run job - VerifyJobs(t, sshClient, jobCommandLow, testLogger) - - // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } + ValidatePACHAOnManagementNodes(t, sshClient, expected.DnsDomainName, bastionIP, managementNodeIPs, logger) // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH key on compute nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) - - // Verify LSF DNS on compute nodes - VerifyLSFDNS(t, sshClient, computeNodeIPList, expectedDnsDomainName, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) - - // Verify SSH connectivity from login node - VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expected.DnsDomainName, logger) // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) // Verify PACHA Failover configuration by validating the application center setup. 
- ValidatePACHAFailoverHealthCheckOnManagementNodes(t, sshClient, expectedDnsDomainName, bastionIP, managementNodeIPList, testLogger) + ValidatePACHAFailoverHealthCheckOnManagementNodes(t, sshClient, expected.DnsDomainName, bastionIP, managementNodeIPs, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // ValidateBasicClusterConfiguration validates basic cluster configuration. @@ -467,96 +339,74 @@ func ValidateClusterConfigurationWithPACHA(t *testing.T, options *testhelper.Tes // including the management node, compute nodes, and login node configurations. // Additionally, it ensures proper connectivity and functionality. // This function doesn't return any value but logs errors and validation steps during the process. -func ValidateBasicClusterConfiguration(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) +func ValidateBasicClusterConfiguration(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Set job commands based on solution type - jobCommandLow, _ := SetJobCommands(expectedSolution, expectedZone) + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + deployerIP, getdeployerIPErr := GetDeployerIPs(t, options, logger) + require.NoError(t, getdeployerIPErr, "Failed to get deployer IP from Terraform outputs - check deployer configuration") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Get the job command for low memory tasks and ignore the other ones + jobCommandLow, _, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") + + VerifyTestTerraformOutputs(t, bastionIP, deployerIP, false, false, false, logger) // Connect to the master node via SSH and 
handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + VerifyManagementNodeConfig(t, sshClient, expected.MasterName, expected.Hyperthreading, managementNodeIPs, expected.LsfVersion, logger) // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() // Run job - VerifyJobs(t, sshClient, jobCommandLow, testLogger) + VerifyJobs(t, sshClient, jobCommandLow, logger) // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) + computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger) if err != nil { t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) } // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() - - // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) - - // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + VerifyComputeNodeConfig(t, sshClient, expected.Hyperthreading, computeNodeIPList, logger) - // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + // Verify login node configuration configuration + runClusterValidationsOnLoginNode(t, 
bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // ValidateBasicClusterConfigurationWithDynamicProfile validates basic cluster configuration. @@ -565,100 +415,69 @@ func ValidateBasicClusterConfiguration(t *testing.T, options *testhelper.TestOpt // Additionally, it ensures proper connectivity and functionality. // The dynamic worker node profile should be created based on the first worker instance type object. // This function doesn't return any value but logs errors and validation steps during the process. -func ValidateBasicClusterConfigurationWithDynamicProfile(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - - expectedZone := options.TerraformVars["zones"].([]string)[0] - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") +func ValidateBasicClusterConfigurationWithDynamicProfile(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" 
Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + VerifyManagementNodeConfig(t, sshClient, expected.MasterName, expected.Hyperthreading, managementNodeIPs, expected.LsfVersion, logger) // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() // Run job - VerifyJobs(t, sshClient, jobCommandMed, testLogger) + VerifyJobs(t, sshClient, jobCommandMed, logger) // Verify dynamic node profile - ValidateDynamicNodeProfile(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, options, testLogger) + ValidateDynamicNodeProfile(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, options, logger) // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) + computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger) if err != nil { t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) } // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() - - // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) - - // Verify PTR records 
- VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) - - // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + VerifyComputeNodeConfig(t, sshClient, expected.Hyperthreading, computeNodeIPList, logger) - // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + // Verify login node configuration configuration + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // ValidateLDAPClusterConfiguration performs comprehensive validation on the cluster setup. @@ -666,153 +485,126 @@ func ValidateBasicClusterConfigurationWithDynamicProfile(t *testing.T, options * // This includes validations for management nodes, compute nodes, login nodes, dynamic compute nodes, and LDAP integration. // Additionally, this function logs detailed information throughout the validation process. // This function doesn't return any value but logs errors and validation steps during the process. -func ValidateLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { +func ValidateLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { + // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword := GetLDAPServerCredentialsInfo(options) - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) - - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + expected := GetExpectedClusterConfig(t, options) + + expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword, getLDAPCredentialsErr := GetValidatedLDAPCredentials(t, options, logger) + require.NoError(t, getLDAPCredentialsErr, "Error occurred while getting LDAP credentials") + + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, ldapServerIP, getClusterIPErr := 
GetClusterIPsWithLDAP(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") + + deployerIP, getdeployerIPErr := GetDeployerIPs(t, options, logger) + require.NoError(t, getdeployerIPErr, "Error occurred while getting deployer IPs") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ldapServerIP, ipRetrievalError := GetClusterIPsWithLDAP(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") + + VerifyTestTerraformOutputs(t, bastionIP, deployerIP, false, false, true, logger) // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. 
Please wait...") - // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) - // Verify SSH key on management nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) + // Reconnect to the management node after reboot + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) - // Verify LSF DNS on management nodes - VerifyLSFDNS(t, sshClient, managementNodeIPList, expectedDnsDomainName, testLogger) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } - // Perform failover and failback - FailoverAndFailback(t, sshClient, jobCommandMed, testLogger) + // Wait for dynamic node disappearance and handle potential errors + defer func() { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) + t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) + } + }() - // Restart LSF daemon - RestartLsfDaemon(t, sshClient, testLogger) + // Verify compute node configuration + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) - // Reboot instance - RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], testLogger) + // Verify login node configuration configuration + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) - // Reconnect to the management node after reboot - sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH") + // Verify LSF DNS settings on login node + VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expected.DnsDomainName, logger) - defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) - } - }() + // Verify file share encryption + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) + + // Connect to the LDAP server via SSH and handle connection errors + sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_LDAP_HOST_NAME, ldapServerIP) + require.NoError(t, connectionErr, "Failed to connect to the LDAP server via SSH") - // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { - t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) + if err := sshLdapClient.Close(); err != nil { + logger.Info(t, fmt.Sprintf("failed to close sshLdapClient: %v", err)) } }() // Run job - VerifyJobs(t, sshClient, 
jobCommandLow, testLogger) + VerifyJobs(t, sshClient, jobCommandLow, logger) // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) + computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger) if err != nil { t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) } - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + // Check LDAP server status + CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, logger) - // Verify SSH key on compute nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) + // Verify management node LDAP config + VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPs, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, logger) - // Verify LSF DNS on compute nodes - VerifyLSFDNS(t, sshClient, computeNodeIPList, expectedDnsDomainName, testLogger) + // Verify compute node LDAP config + VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, logger) - // Verify SSH connectivity from login node and handle connection errors + // Verify SSH connectivity from login node sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") defer func() { if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) } }() - // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) - - // Verify SSH connectivity from login node - VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) - - // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) - - // Connect to the LDAP server via SSH and handle connection errors - sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_LDAP_HOST_NAME, ldapServerIP) - require.NoError(t, connectionErr, "Failed to connect to the LDAP server via SSH") - - defer func() { - if err := sshLdapClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLdapClient: %v", err)) - } - }() - - // Check LDAP server status - CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, testLogger) - - // Verify management node LDAP config - VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPList, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - - // Verify compute node LDAP config - VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - - // Verify login 
node LDAP config - VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Verify login node configuration LDAP config + VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, logger) // Verify ability to create LDAP user and perform LSF actions using new user - VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, ldapServerIP, managementNodeIPList, jobCommandLow, ldapAdminPassword, expectedLdapDomain, ldapUserName, ldapUserPassword, "tester2", testLogger) + VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, ldapServerIP, managementNodeIPs, jobCommandLow, ldapUserName, ldapAdminPassword, expectedLdapDomain, NEW_LDAP_USER_NAME, NEW_LDAP_USER_PASSWORD, logger) // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // ValidatePACANDLDAPClusterConfiguration performs comprehensive validation on the PAC and LDAP cluster setup. @@ -820,128 +612,74 @@ func ValidateLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOpti // This includes validations for management nodes, compute nodes, login nodes, dynamic compute nodes, LDAP server, application center, and noVNC. // Additionally, this function logs detailed information throughout the validation process. // This function doesn't return any value but logs errors and validation steps during the process. 
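// --- Illustrative sketch (not part of the patch): the shape implied by the new call
//   domain, adminPwd, user, userPwd, err := GetValidatedLDAPCredentials(t, options, logger)
// i.e. the four values the old GetLDAPServerCredentialsInfo returned, now with an explicit
// error instead of silently accepted defaults. The Terraform variable names used below are
// assumptions, not confirmed keys from this repository.
package sketch

import "fmt"

// getValidatedLDAPCredentials reads the LDAP settings from the Terraform variables map and
// rejects any missing or empty value, so a misconfigured test fails before any SSH work.
func getValidatedLDAPCredentials(vars map[string]interface{}) (domain, adminPwd, user, userPwd string, err error) {
	read := func(key string) (string, error) {
		v, ok := vars[key].(string)
		if !ok || v == "" {
			return "", fmt.Errorf("terraform variable %q is missing or not a non-empty string", key)
		}
		return v, nil
	}
	if domain, err = read("ldap_basedns"); err != nil {
		return "", "", "", "", err
	}
	if adminPwd, err = read("ldap_admin_password"); err != nil {
		return "", "", "", "", err
	}
	if user, err = read("ldap_user_name"); err != nil {
		return "", "", "", "", err
	}
	if userPwd, err = read("ldap_user_password"); err != nil {
		return "", "", "", "", err
	}
	return domain, adminPwd, user, userPwd, nil
}
// --- end sketch ---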
-func ValidatePACANDLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { +func ValidatePACANDLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { + // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword := GetLDAPServerCredentialsInfo(options) - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) - - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + expected := GetExpectedClusterConfig(t, options) - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ldapServerIP, ipRetrievalError := GetClusterIPsWithLDAP(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword, getLDAPCredentialsErr := GetValidatedLDAPCredentials(t, options, logger) + require.NoError(t, getLDAPCredentialsErr, "Error occurred while getting LDAP credentials") + + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, ldapServerIP, getClusterIPErr := GetClusterIPsWithLDAP(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") + + deployerIP, getdeployerIPErr := GetDeployerIPs(t, options, logger) + require.NoError(t, getdeployerIPErr, "Failed to get deployer IP from Terraform outputs - check deployer configuration") + + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") + + // verify terraform outpu + VerifyTestTerraformOutputs(t, bastionIP, deployerIP, false, false, true, logger) // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := 
utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") - // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) - - // Verify application center configuration - VerifyAPPCenterConfig(t, sshClient, testLogger) - - // Verify noVNC configuration - VerifyNoVNCConfig(t, sshClient, testLogger) - - // Verify SSH key on management nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) - - // Verify LSF DNS on management nodes - VerifyLSFDNS(t, sshClient, managementNodeIPList, expectedDnsDomainName, testLogger) - - // Perform failover and failback - FailoverAndFailback(t, sshClient, jobCommandMed, testLogger) - - // Restart LSF daemon - RestartLsfDaemon(t, sshClient, testLogger) - - // Reboot instance - RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], testLogger) + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) // Reconnect to the management node after reboot - sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH") + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) - defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) - } - }() + logger.FAIL(t, msg) + require.FailNow(t, msg) + } // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Run job - VerifyJobs(t, sshClient, jobCommandLow, testLogger) - - // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH key on 
compute nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) - // Verify LSF DNS on compute nodes - VerifyLSFDNS(t, sshClient, computeNodeIPList, expectedDnsDomainName, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() - - // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) - - // Verify SSH connectivity from login node - VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) + // Verify login node configuration configuration + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) // Connect to the LDAP server via SSH and handle connection errors sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_LDAP_HOST_NAME, ldapServerIP) @@ -949,242 +687,132 @@ func ValidatePACANDLDAPClusterConfiguration(t *testing.T, options *testhelper.Te defer func() { if err := sshLdapClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLdapClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshLdapClient: %v", err)) } }() + // Get compute node IPs and handle errors + computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger) + if err != nil { + t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) + } + // Check LDAP server status - CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, testLogger) + CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, logger) // Verify management node LDAP config - VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPList, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPs, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, logger) // Verify compute node LDAP config - VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, logger) + + // 
Verify SSH connectivity from login node + sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) + require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - // Verify login node LDAP config - VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + defer func() { + if err := sshLoginNodeClient.Close(); err != nil { + logger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) + } + }() + + // Verify login node configuration LDAP config + VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, logger) // Verify ability to create LDAP user and perform LSF actions using new user - VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, ldapServerIP, managementNodeIPList, jobCommandLow, ldapAdminPassword, expectedLdapDomain, ldapUserName, ldapUserPassword, "tester2", testLogger) + VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, ldapServerIP, managementNodeIPs, jobCommandLow, ldapUserName, ldapAdminPassword, expectedLdapDomain, NEW_LDAP_USER_NAME, NEW_LDAP_USER_PASSWORD, logger) // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } -// ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos validates the basic cluster configuration -// including VPC flow logs and COS service instance. -// It performs validation tasks on essential aspects of the cluster setup, -// such as management node, compute nodes, and login node configurations. -// Additionally, it ensures proper connectivity and functionality. -// This function doesn't return any value but logs errors and validation steps during the process. -func ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) +// ValidateExistingLDAPClusterConfig performs comprehensive validation on an existing LDAP cluster configuration. 
+// It connects to various cluster components via SSH to verify their configurations and functionality, +// including management nodes, compute nodes, login nodes, dynamic compute nodes, and LDAP integration. +// This function logs detailed information throughout the validation process and does not return any value. +func ValidateExistingLDAPClusterConfig(t *testing.T, ldapServerBastionIP, ldapServerIP, expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword string, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - // Set job commands based on solution type - jobCommandLow, _ := SetJobCommands(expectedSolution, expectedZone) + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() - // - fmt.Println(loginNodeIP, expectedDnsDomainName) // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. 
Please wait...") - // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) + + // Reconnect to the management node after reboot + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Run job - VerifyJobs(t, sshClient, jobCommandLow, testLogger) - - // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) - - // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) - - // Validate COS service instance and VPC flow logs - ValidateCosServiceInstanceAndVpcFlowLogs(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, 
testLogger) - - // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") -} - -// ValidateClusterConfigurationWithMultipleKeys performs a comprehensive validation on the cluster setup. -// It connects to various cluster components via SSH and verifies their configurations and functionality, -// including management nodes, compute nodes, login nodes, and dynamic compute nodes. It also performs -// additional validation checks like failover procedures, SSH key verification, and DNS verification. -// The function logs detailed information throughout the validation process but does not return any value. -func ValidateClusterConfigurationWithMultipleKeys(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) - - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } - - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) - - // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") - - // Connect to the management node via SSH - sshClientOne, sshClientTwo, connectionErrOne, connectionErrTwo := utils.ConnectToHostsWithMultipleUsers(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErrOne, "Failed to connect to the master via SSH") - require.NoError(t, connectionErrTwo, "Failed to connect to the master via SSH") - - defer func() { - if err := sshClientOne.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClientOne: %v", err)) - } - }() - - defer func() { - if err := sshClientTwo.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClientTwo: %v", err)) - } - }() + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) - testLogger.Info(t, "SSH connection to the master successful") - t.Log("Validation in progress. 
Please wait...") - - // Verify management node configuration - VerifyManagementNodeConfig(t, sshClientOne, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) - VerifyManagementNodeConfig(t, sshClientTwo, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) - - // Verify SSH key on management node - VerifySSHKey(t, sshClientOne, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) - - // Perform failover and failback - FailoverAndFailback(t, sshClientOne, jobCommandMed, testLogger) - - // Restart LSF daemon - RestartLsfDaemon(t, sshClientOne, testLogger) - - // Reboot instance - RebootInstance(t, sshClientOne, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], testLogger) - - // Reconnect to the management node after reboot - sshClientOne, connectionErrOne = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErrOne, "Failed to reconnect to the master via SSH: %v", connectionErrOne) - - defer func() { - if err := sshClientOne.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClientOne: %v", err)) - } - }() + // Connect to the LDAP server via SSH + sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, ldapServerBastionIP, LSF_LDAP_HOST_NAME, ldapServerIP) + require.NoError(t, connectionErr, "Failed to connect to the LDAP server via SSH") - // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClientOne, testLogger); err != nil { - t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) + if err := sshLdapClient.Close(); err != nil { + logger.Info(t, fmt.Sprintf("failed to close sshLdapClient: %v", err)) } }() - // Run job - VerifyJobs(t, sshClientOne, jobCommandLow, testLogger) - - // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClientOne, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClientOne, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + // Check LDAP server status + CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, logger) - // Verify SSH key on compute nodes - VerifySSHKey(t, sshClientOne, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) + // Verify management node LDAP configuration + VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPs, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, logger) - // Verify LSF DNS on compute nodes - VerifyLSFDNS(t, sshClientOne, computeNodeIPList, expectedDnsDomainName, testLogger) + // Verify compute node LDAP configuration + VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, managementNodeIPs, expectedLdapDomain, ldapUserName, ldapUserPassword, logger) // Verify SSH connectivity from login node sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) @@ 
-1192,183 +820,94 @@ func ValidateClusterConfigurationWithMultipleKeys(t *testing.T, options *testhel defer func() { if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) } }() - // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + // Verify login node configuration LDAP configuration + VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, logger) - // Get compute node IPs and handle errors - computeNodeIPList, err = GetComputeNodeIPs(t, sshClientOne, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - - // Verify SSH connectivity from login node - VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) - - // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClientOne, []string{loginNodeIP}, expectedDnsDomainName, testLogger) - - // Verify file share encryption - VerifyFileShareEncryption(t, sshClientOne, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + // Verify LDAP user creation and LSF actions using the new user + VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, ldapServerIP, managementNodeIPs, jobCommandLow, ldapUserName, ldapAdminPassword, expectedLdapDomain, NEW_LDAP_USER_NAME, NEW_LDAP_USER_PASSWORD, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } -// ValidateExistingLDAPClusterConfig performs comprehensive validation on an existing LDAP cluster configuration. -// It connects to various cluster components via SSH to verify their configurations and functionality, -// including management nodes, compute nodes, login nodes, dynamic compute nodes, and LDAP integration. -// This function logs detailed information throughout the validation process and does not return any value. -func ValidateExistingLDAPClusterConfig(t *testing.T, ldapServerBastionIP, ldapServerIP, expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword string, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { +// ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos validates the basic cluster configuration +// including VPC flow logs and COS service instance. +// It performs validation tasks on essential aspects of the cluster setup, +// such as management node, compute nodes, and login node configurations. +// Additionally, it ensures proper connectivity and functionality. +// This function doesn't return any value but logs errors and validation steps during the process. 
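// --- Illustrative sketch (not part of the patch): the deferred cleanup used after each
// reconnect above, which now reports a stuck dynamic node both to the aggregated log
// (logger.Error) and to the Go test (t.Errorf) without aborting the remaining deferred
// closes. The waiter callback and errLogger interface are stand-ins for
// WaitForDynamicNodeDisappearance and the repository's AggregatedLogger.
package sketch

import (
	"fmt"
	"testing"
)

type errLogger interface {
	Error(t *testing.T, msg string)
}

// deferDynamicNodeCleanup returns a func suitable for `defer`, e.g.
//   defer deferDynamicNodeCleanup(t, logger, func() error {
//       return WaitForDynamicNodeDisappearance(t, sshClient, logger)
//   })()
func deferDynamicNodeCleanup(t *testing.T, logger errLogger, wait func() error) func() {
	return func() {
		if err := wait(); err != nil {
			msg := fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)
			logger.Error(t, msg)
			t.Errorf("%s", msg)
		}
	}
}
// --- end sketch ---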
+func ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) + expected := GetExpectedClusterConfig(t, options) - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) - - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Get the job command for low memory tasks and ignore the other ones + jobCommandLow, _, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") - + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. 
Please wait...") // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) - - // Verify SSH key on management nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) - - // Verify LSF DNS on management nodes - VerifyLSFDNS(t, sshClient, managementNodeIPList, expectedDnsDomainName, testLogger) - - // Perform failover and failback - FailoverAndFailback(t, sshClient, jobCommandMed, testLogger) - - // Restart LSF daemon - RestartLsfDaemon(t, sshClient, testLogger) - - // Reboot instance - RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], testLogger) + VerifyManagementNodeConfig(t, sshClient, expected.MasterName, expected.Hyperthreading, managementNodeIPs, expected.LsfVersion, logger) // Reconnect to the management node after reboot - sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH") + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) - defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) - } - }() + logger.FAIL(t, msg) + require.FailNow(t, msg) + } // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Run job - VerifyJobs(t, sshClient, jobCommandLow, testLogger) - // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH key on compute nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) - - // Verify LSF DNS on compute nodes - VerifyLSFDNS(t, sshClient, computeNodeIPList, expectedDnsDomainName, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) // Verify 
login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) - // Verify SSH connectivity from login node - VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) - - // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + // Verify LSF DNS on login node + //VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expected.DnsDomainName, logger) // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) - - // Connect to the LDAP server via SSH - sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, ldapServerBastionIP, LSF_LDAP_HOST_NAME, ldapServerIP) - require.NoError(t, connectionErr, "Failed to connect to the LDAP server via SSH") - - defer func() { - if err := sshLdapClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLdapClient: %v", err)) - } - }() - - // Check LDAP server status - CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, testLogger) - - // Verify management node LDAP configuration - VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPList, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - - // Verify compute node LDAP configuration - VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - - // Verify login node LDAP configuration - VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) - // Verify LDAP user creation and LSF actions using the new user - VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, ldapServerIP, managementNodeIPList, jobCommandLow, ldapAdminPassword, expectedLdapDomain, ldapUserName, ldapUserPassword, "tester2", testLogger) + // Validate COS service instance and VPC flow logs + ValidateCosServiceInstanceAndVpcFlowLogs(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // ValidateBasicClusterConfigurationLSFLogs validates the basic cluster configuration @@ -1381,96 +920,63 @@ func ValidateExistingLDAPClusterConfig(t *testing.T, ldapServerBastionIP, ldapSe // - Validate LSF logs by checking the directory structure and symbolic links in the shared folder. // - Reconnect to the master node after reboot and verify job execution. 
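Editor's note: runClusterValidationsOnComputeNode and runClusterValidationsOnLoginNode, used throughout the hunks below, are also defined outside this diff. Judging by the inline code they replace (job submission, compute IP lookup, node configuration, SSH key and DNS checks), the compute-side wrapper plausibly looks like the sketch below. It assumes the package's existing helpers and the expectedClusterConfig sketch above; parameter types are inferred from usage and the real helper may differ.

// Sketch only - the real wrapper lives outside this hunk.
func runClusterValidationsOnComputeNodeSketch(t *testing.T, sshClient *ssh.Client, bastionIP string,
	staticWorkerNodeIPs []string, expected expectedClusterConfig, jobCommand string, logger *utils.AggregatedLogger) {

	// Submit a job first so dynamic compute nodes exist when the checks run.
	VerifyJobs(t, sshClient, jobCommand, logger)

	// Collect static plus dynamic compute node IPs (the 4-argument signature used in this diff).
	computeNodeIPs, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger)
	if err != nil {
		t.Fatalf("Failed to retrieve compute node IPs: %v", err)
	}

	// Per-node configuration and DNS checks, mirroring the inline code this wrapper replaces.
	// The real helper presumably also uses bastionIP for the SSH key checks that were removed inline.
	VerifyComputeNodeConfig(t, sshClient, expected.Hyperthreading, computeNodeIPs, logger)
	VerifyLSFDNS(t, sshClient, computeNodeIPs, expected.DnsDomainName, logger)
}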
-func ValidateBasicClusterConfigurationLSFLogs(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve cluster details from the options provided for validation - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - - // Parse hyperthreading setting - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing 'hyperthreading_enabled' setting: %v", err) +func ValidateBasicClusterConfigurationLSFLogs(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - // Set job command based on solution and zone - jobCommandLow, _ := SetJobCommands(expectedSolution, expectedZone) + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Validate cluster creation - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() - // Retrieve server IPs (handles logic for HPC vs LSF) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error while retrieving server IPs: %v", ipRetrievalError) + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Log the start of validation - testLogger.Info(t, t.Name()+" Validation started ......") + // Log validation start + logger.Info(t, t.Name()+" Validation started ......") - // Establish SSH connection to master node - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + // Connect to the master node via SSH and handle connection errors + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. 
Please wait...") - // Validate management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) // Validate LSF logs: Check if the logs are stored in their correct directory and ensure symbolic links are present - ValidateLSFLogs(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, bastionIP, managementNodeIPList, testLogger) + ValidateLSFLogs(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, bastionIP, managementNodeIPs, logger) // Reconnect to the master node via SSH after reboot - sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH") - - defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) - } - }() + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + require.NoError(t, connectionErr, "Failed to re-establish SSH connection after reboot - check node recovery") - // Wait for dynamic node disappearance + // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Execute job verification - VerifyJobs(t, sshClient, jobCommandLow, testLogger) - - // Retrieve compute node IPs - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - - // Validate compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH connectivity from login node - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + // Verify compute node configuration + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) - // Validate login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + // Verify login node configuration + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Log the end of validation - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // 
ValidateBasicClusterConfigurationWithDedicatedHost validates the basic configuration of a cluster @@ -1478,205 +984,152 @@ func ValidateBasicClusterConfigurationLSFLogs(t *testing.T, options *testhelper. // connectivity between all components are configured correctly. The function performs various // validation tasks including checking cluster details, node configurations, IP retrieval, // and job execution. This function logs all validation steps and errors during the process. -func ValidateBasicClusterConfigurationWithDedicatedHost(t *testing.T, options *testhelper.TestOptions, expectedDedicatedHostPresence bool, testLogger *utils.AggregatedLogger) { +func ValidateBasicClusterConfigurationWithDedicatedHost(t *testing.T, options *testhelper.TestOptions, expectedDedicatedHostPresence bool, logger *utils.AggregatedLogger) { + // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - WorkerNodeMinCount, err := utils.GetTotalWorkerNodeCount(t, options.TerraformVars, testLogger) + expected := GetExpectedClusterConfig(t, options) + WorkerNodeMinCount, err := utils.GetTotalStaticComputeCount(t, options.TerraformVars, logger) require.NoError(t, err, "Error retrieving worker node total count") - expectedZone := options.TerraformVars["zones"].([]string)[0] - - // Retrieve expected DNS domain name for compute nodes - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - // Parse hyperthreading configuration - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - - // Set job commands based on solution type - jobCommandLow, _ := SetJobCommands(expectedSolution, expectedZone) - // Run the test consistency check for cluster creation for cluster creation - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + // Get the job command for low memory tasks and ignore the other ones + jobCommandLow, _, _ := GenerateLSFJobCommandsForMemoryTypes() // Retrieve IPs for all the required nodes (bastion, management, login, and static worker) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, 
managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + VerifyManagementNodeConfig(t, sshClient, expected.MasterName, expected.Hyperthreading, managementNodeIPs, expected.LsfVersion, logger) // Verify dedicated host configuration - ValidateDedicatedHost(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, WorkerNodeMinCount, expectedDedicatedHostPresence, testLogger) + ValidateDedicatedHost(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, WorkerNodeMinCount, expectedDedicatedHostPresence, logger) + + // Reconnect to the management node after reboot + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Run job to verify job execution on the cluster - VerifyJobs(t, sshClient, jobCommandLow, testLogger) - - // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, 
logger) // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Verify PTR records for management and login nodes - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) // Verify LSF DNS settings on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expected.DnsDomainName, logger) // Verify file share encryption configuration - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } -// ValidateBasicClusterConfigurationWithSCC validates the basic cluster configuration. -// It performs validation tasks on essential aspects of the cluster setup, -// including the management node, compute nodes, and login node configurations. -// Additionally, it ensures proper connectivity and functionality. -// This function checks service instance details, extracts relevant GUIDs, and verifies attachments' states. -// Errors and validation steps are logged during the process. -func ValidateBasicClusterConfigurationWithSCC(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] +// ValidateBasicClusterConfigurationWithSCCWPAndCSPM validates the cluster configuration +// with SCCWP and CSPM enabled. +// It performs validation on critical components such as the management node, +// compute nodes, and login node to ensure proper setup and connectivity. +// All validation steps and errors are logged throughout the process.
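Editor's note: the SSH bootstrap in the function below, and in most of the refactored validators, repeats the same fail-fast pattern: build a message, log it through logger.FAIL, then stop the test with require.FailNow. The hypothetical wrapper below only illustrates that shape as if it sat next to the call sites (assuming the file's existing imports, the ssh client type returned by utils.ConnectToHost, and the project's logger.FAIL API); it is a suggestion, not code from this repository.

// Hypothetical consolidation of the repeated connect-or-fail pattern seen in this diff.
// utils.ConnectToHost and logger.FAIL are the project's APIs; require.FailNow is testify.
func mustConnect(t *testing.T, logger *utils.AggregatedLogger, publicHost, bastionIP, privateHost, nodeIP string) *ssh.Client {
	client, err := utils.ConnectToHost(publicHost, bastionIP, privateHost, nodeIP)
	if err != nil {
		msg := fmt.Sprintf("Failed to establish SSH connection via bastion (%s) -> private IP (%s): %v", bastionIP, nodeIP, err)
		logger.FAIL(t, msg)     // record the failure in the aggregated log
		require.FailNow(t, msg) // abort this test immediately
	}
	return client
}

// Usage, mirroring the call sites below:
//   sshClient := mustConnect(t, logger, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0])
//   defer func() {
//       if err := sshClient.Close(); err != nil {
//           logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err))
//       }
//   }()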
+func ValidateBasicClusterConfigurationWithSCCWPAndCSPM(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - // Retrieve expected DNS domain name for compute nodes - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - // Parse hyperthreading configuration - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - - // Set job commands based on solution type - jobCommandLow, _ := SetJobCommands(expectedSolution, expectedZone) + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Run the test consistency check for cluster creation - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Get the job command for low memory tasks and ignore the other ones + jobCommandLow, _, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. 
Please wait...") // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + VerifyManagementNodeConfig(t, sshClient, expected.MasterName, expected.Hyperthreading, managementNodeIPs, expected.LsfVersion, logger) // Verify SCC instance - ValidateSCCInstance(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, SCC_INSTANCE_REGION, testLogger) + //ValidateSCCInstance(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, SCC_INSTANCE_REGION, logger) // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() // Run job to verify job execution on the cluster - VerifyJobs(t, sshClient, jobCommandLow, testLogger) + VerifyJobs(t, sshClient, jobCommandLow, logger) // Get compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) + computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger) if err != nil { t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) } // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + VerifyComputeNodeConfig(t, sshClient, expected.Hyperthreading, computeNodeIPList, logger) // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) - - // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) - - // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Verify file share encryption configuration - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) // Log validation end - testLogger.Info(t, 
t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // ValidateBasicClusterConfigurationWithCloudLogs validates essential cluster configurations and logs errors. @@ -1685,105 +1138,82 @@ func ValidateBasicClusterConfigurationWithSCC(t *testing.T, options *testhelper. // Errors are handled explicitly, and validation steps are logged for debugging. // Key validation and configuration checks ensure that the cluster setup adheres to standards. -func ValidateBasicClusterConfigurationWithCloudLogs(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] +func ValidateBasicClusterConfigurationWithCloudLogs(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) expectedLogsEnabledForManagement, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_logs_enable_for_management"])) - require.NoError(t, err, "Error parsing observability_logs_enable_for_management") + require.NoError(t, err, "Failed to parse observability_logs_enable_for_management from Terraform vars - check variable type and value") expectedLogsEnabledForCompute, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_logs_enable_for_compute"])) - require.NoError(t, err, "Error parsing observability_logs_enable_for_compute") + require.NoError(t, err, "Failed to parse observability_logs_enable_for_compute from Terraform vars - check variable type and value") - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + deployerIP, getdeployerIPErr := GetDeployerIPs(t, options, logger) + require.NoError(t, getdeployerIPErr, "Failed to get deployer IP from Terraform outputs - check deployer configuration") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", 
ipRetrievalError) + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") + + VerifyTestTerraformOutputs(t, bastionIP, deployerIP, expectedLogsEnabledForManagement, false, false, logger) // Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) + + // Reconnect to the management node after reboot + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Run job - VerifyJobs(t, sshClient, jobCommandMed, testLogger) - - // Get static and dynamic compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) // Verify that cloud logs are enabled and correctly configured - VerifyCloudLogs(t, sshClient, expectedSolution, options.LastTestTerraformOutputs, managementNodeIPList, staticWorkerNodeIPList, expectedLogsEnabledForManagement, expectedLogsEnabledForCompute, testLogger) - - // Verify SSH connectivity from login node and 
handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + VerifyCloudLogs(t, sshClient, options.LastTestTerraformOutputs, managementNodeIPs, staticWorkerNodeIPs, expectedLogsEnabledForManagement, expectedLogsEnabledForCompute, logger) // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) - - // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) - - // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // ValidateBasicClusterConfigurationWithCloudMonitoring validates essential cluster configurations and logs errors. @@ -1792,105 +1222,84 @@ func ValidateBasicClusterConfigurationWithCloudLogs(t *testing.T, options *testh // Errors are handled explicitly, and validation steps are logged for debugging. // Key validation and configuration checks ensure that the cluster setup adheres to standards. 
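Editor's note: the observability flags in the function below are read with strconv.ParseBool(fmt.Sprintf("%v", ...)) rather than a direct type assertion. That is the safe pattern when a Terraform variable may arrive as either a Go bool or a string, depending on how the test options were built. A small self-contained illustration (variable names and values are placeholders):

package main

import (
	"fmt"
	"strconv"
)

func main() {
	// TerraformVars values may be bool or string depending on how they were set.
	vars := map[string]interface{}{
		"observability_monitoring_enable":                  true,    // set as a Go bool
		"observability_monitoring_on_compute_nodes_enable": "false", // set as a string
	}

	for _, key := range []string{
		"observability_monitoring_enable",
		"observability_monitoring_on_compute_nodes_enable",
	} {
		// fmt.Sprintf("%v", ...) normalises both cases to "true"/"false",
		// so a single ParseBool handles either representation.
		enabled, err := strconv.ParseBool(fmt.Sprintf("%v", vars[key]))
		if err != nil {
			fmt.Printf("failed to parse %s: %v\n", key, err)
			continue
		}
		fmt.Printf("%s = %t\n", key, enabled)
	}
}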
-func ValidateBasicClusterConfigurationWithCloudMonitoring(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details from options - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") +func ValidateBasicClusterConfigurationWithCloudMonitoring(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) expectedMonitoringEnabledForManagement, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_monitoring_enable"])) - require.NoError(t, err, "Error parsing observability_monitoring_enable") + require.NoError(t, err, "Failed to parse observability_monitoring_enable from Terraform vars - check variable type and value") expectedMonitoringEnabledForCompute, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_monitoring_on_compute_nodes_enable"])) - require.NoError(t, err, "Error parsing observability_monitoring_on_compute_nodes_enable") + require.NoError(t, err, "Failed to parse observability_monitoring_on_compute_nodes_enable from Terraform vars - check variable type and value") - // Set job commands based on solution type - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Run the test consistency check - clusterCreationErr := ValidateClusterCreation(t, options, testLogger) - if clusterCreationErr != nil { - require.NoError(t, clusterCreationErr, "Cluster creation validation failed: %v") - } + deployerIP, getdeployerIPErr := GetDeployerIPs(t, options, logger) + require.NoError(t, getdeployerIPErr, "Failed to get deployer IP from Terraform outputs - check deployer configuration") - // Retrieve server IPs (different logic for HPC vs LSF solutions) - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, ipRetrievalError := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started ......") + logger.Info(t, t.Name()+" Validation started ......") + + VerifyTestTerraformOutputs(t, bastionIP, deployerIP, false, expectedMonitoringEnabledForManagement, false, logger) // 
Connect to the master node via SSH and handle connection errors - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) + + // Reconnect to the management node after reboot + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Run job - VerifyJobs(t, sshClient, jobCommandMed, testLogger) - - // Get static and dynamic compute node IPs and handle errors - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - if err != nil { - t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) - } - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify that cloud monitoring are enabled and correctly configured - VerifyCloudMonitoring(t, sshClient, expectedSolution, options.LastTestTerraformOutputs, managementNodeIPList, staticWorkerNodeIPList, expectedMonitoringEnabledForManagement, expectedMonitoringEnabledForCompute, testLogger) - - // Verify SSH connectivity from login node and handle connection errors - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + 
runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) - // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + // Verify that cloud monitoring is enabled and correctly configured + VerifyCloudMonitoring(t, sshClient, options.LastTestTerraformOutputs, managementNodeIPs, staticWorkerNodeIPs, expectedMonitoringEnabledForManagement, expectedMonitoringEnabledForCompute, logger) // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) - - // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } // ValidateBasicClusterConfigurationWithCloudAtracker verifies that the cluster setup aligns with the expected configuration @@ -1899,355 +1308,369 @@ func ValidateBasicClusterConfigurationWithCloudMonitoring(t *testing.T, options // The function establishes SSH connections to validate node configurations, runs job verification tests, // checks PTR records, and ensures file share encryption. If any configuration discrepancies are found, // appropriate test errors are raised.
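Editor's note: in the Atracker function below (and in the other refactored validators), defer sshClient.Close() is registered before the deferred WaitForDynamicNodeDisappearance call. Go runs deferred calls last-in-first-out, so the node-drain wait executes first, while the SSH client is still open, and the close runs last. A self-contained illustration of that ordering:

package main

import "fmt"

func main() {
	// Mirrors the defer stacking in the validators below:
	// the close is registered first, the node-drain wait second.
	fmt.Println("connect")

	defer fmt.Println("3) close SSH client")                // registered first  -> runs last
	defer fmt.Println("2) wait for dynamic nodes to drain") // registered second -> runs first

	fmt.Println("1) run validations")
	// Output order: connect, 1) run validations, 2) wait ..., 3) close ...
}

Since the deferred close in the real code is a closure over the sshClient variable, it closes whichever client was most recently assigned, i.e. the post-reboot reconnection.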
-func ValidateBasicClusterConfigurationWithCloudAtracker(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve common cluster details - expectedSolution := strings.ToLower(options.TerraformVars["solution"].(string)) - expectedClusterName, expectedReservationID, expectedMasterName := GetClusterInfo(options) - expectedResourceGroup := options.TerraformVars["existing_resource_group"].(string) - expectedKeyManagement := options.TerraformVars["key_management"].(string) - expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedTargetType := options.TerraformVars["observability_atracker_target_type"].(string) - - expectedObservabilityAtrackerEnable, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_atracker_enable"])) - require.NoError(t, err, "Error parsing observability_atracker_enable") +func ValidateBasicClusterConfigurationWithCloudAtracker(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - require.True(t, ok, "Missing or invalid 'compute' key in dns_domain_name") - - expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) - require.NoError(t, err, "Error parsing hyperthreading_enabled") + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Set job commands - jobCommandLow, jobCommandMed := SetJobCommands(expectedSolution, expectedZone) + expectedTargetType := options.TerraformVars["observability_atracker_target_type"].(string) - // Validate cluster creation - require.NoError(t, ValidateClusterCreation(t, options, testLogger), "Cluster creation validation failed") + expectedObservabilityAtrackerEnable, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_atracker_enable"])) + require.NoError(t, err, "Failed to parse observability_atracker_enable from Terraform vars - check variable type and value") // Retrieve server IPs - bastionIP, managementNodeIPList, loginNodeIP, staticWorkerNodeIPList, err := GetClusterIPs(t, options, expectedSolution, testLogger) - require.NoError(t, err, "Failed to retrieve cluster IPs") + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - testLogger.Info(t, t.Name()+" Validation started ......") + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() - // Establish SSH connection to master node - sshClient, err := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, err, "Failed to connect to the master node via SSH") + // Log validation start + logger.Info(t, t.Name()+" Validation started ......") + + // Connect to the master node via SSH and handle connection errors + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node (%s) via bastion (%s) -> private IP (%s): %v", + LSF_PUBLIC_HOST_NAME, bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + 
} defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) } }() - testLogger.Info(t, "SSH connection to master node successful") - + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") - // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, expectedSolution, testLogger) + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) + + // Reconnect to the management node after reboot + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("SSH connection to master node via bastion (%s) -> private IP (%s) failed after reboot: %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } - // Ensure dynamic node disappearance check runs after validation + // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Run job verification - VerifyJobs(t, sshClient, jobCommandMed, testLogger) - - // Get compute node IPs - computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, testLogger, expectedSolution, staticWorkerNodeIPList) - require.NoError(t, err, "Failed to retrieve dynamic compute node IPs") - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) // Validate Atracker ibmCloudAPIKey := os.Getenv("TF_VAR_ibmcloud_api_key") - ValidateAtracker(t, ibmCloudAPIKey, utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedTargetType, expectedObservabilityAtrackerEnable, testLogger) - - // Establish SSH connection to login node - sshLoginNodeClient, err := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, err, "Failed to connect to the login node via SSH") - - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + ValidateAtracker(t, ibmCloudAPIKey, utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expectedTargetType, expectedObservabilityAtrackerEnable, logger) // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) // Verify PTR records - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, 
managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) - - // Verify LSF DNS on login node - VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, ibmCloudAPIKey, utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + VerifyFileShareEncryption(t, sshClient, ibmCloudAPIKey, utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" Validation ended") } -// ValidateClusterConfigWithAPPCenterOnExistingEnvironment validates the configuration of an existing cluster with App Center integration. -// It verifies management node configuration, SSH keys, failover and failback, LSF daemon restart, dynamic compute node configuration, -// login node configuration, SSH connectivity, application center configuration, noVNC configuration, PTR records, and file share encryption. -// The function connects to various nodes, performs required actions, and logs results using the provided test logger. -// Parameters include expected values, IP addresses, and configuration settings to ensure the cluster operates correctly with the specified integrations. -func ValidateClusterConfigWithAPPCenterOnExistingEnvironment( - t *testing.T, - computeSshKeysList []string, - bastionIP, loginNodeIP, expectedClusterName, expectedReservationID, expectedMasterName, expectedResourceGroup, - expectedKeyManagement, expectedZone, expectedDnsDomainName string, - managementNodeIPList []string, - expectedHyperthreadingEnabled bool, - testLogger *utils.AggregatedLogger, -) { - - expectedNumOfKeys := len(computeSshKeysList) - - // Retrieve job commands for different levels - jobCommandLow := GetJobCommand(expectedZone, "low") - jobCommandMed := GetJobCommand(expectedZone, "med") +// ValidateBasicObservabilityClusterConfiguration verifies observability features in an HPC LSF cluster. +// It checks log/monitoring enablement, Atracker config, DNS, PTR records, and encryption settings. +// The function connects to management and compute nodes via SSH for validations. +// It ensures dynamic worker nodes disappear as expected after reboot. 
- // Log validation start - testLogger.Info(t, t.Name()+" Validation started...") +func ValidateBasicObservabilityClusterConfiguration(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Connect to the master node via SSH - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + expectedLogsEnabledForManagement, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_logs_enable_for_management"])) + require.NoError(t, err, "Failed to parse observability_logs_enable_for_management from Terraform vars - check variable type and value") - defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) - } - }() + expectedLogsEnabledForCompute, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_logs_enable_for_compute"])) + require.NoError(t, err, "Failed to parse observability_logs_enable_for_compute from Terraform vars - check variable type and value") - testLogger.Info(t, "SSH connection to the master successful") - t.Log("Validation in progress. Please wait...") + expectedEnabledPlatFormLogs, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_enable_platform_logs"])) + require.NoError(t, err, "Failed to parse observability_enable_platform_logs from Terraform vars - check variable type and value") - // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, "hpc", testLogger) + expectedMonitoringEnabledForManagement, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_monitoring_enable"])) + require.NoError(t, err, "Failed to parse observability_monitoring_enable from Terraform vars - check variable type and value") - // Verify SSH key on management nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) + expectedMonitoringEnabledForCompute, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_monitoring_on_compute_nodes_enable"])) + require.NoError(t, err, "Failed to parse observability_monitoring_on_compute_nodes_enable from Terraform vars - check variable type and value") - // Perform failover and failback - FailoverAndFailback(t, sshClient, jobCommandMed, testLogger) + expectedTargetType := options.TerraformVars["observability_atracker_target_type"].(string) - // Restart LSF daemon - RestartLsfDaemon(t, sshClient, testLogger) + expectedObservabilityAtrackerEnable, err := strconv.ParseBool(fmt.Sprintf("%v", options.TerraformVars["observability_atracker_enable"])) + require.NoError(t, err, "Failed to parse observability_atracker_enable from Terraform vars - check variable type and value") - // Reboot instance - RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], testLogger) + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from 
Terraform outputs - check network configuration") - // Reconnect to the master node via SSH after reboot - sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH") + deployerIP, getdeployerIPErr := GetDeployerIPs(t, options, logger) + require.NoError(t, getdeployerIPErr, "Failed to get deployer IP from Terraform outputs - check deployer configuration") + + // Set job commands for low and medium memory tasks (high memory command skipped) + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() + + logger.Info(t, t.Name()+" validation started") + + VerifyTestTerraformOutputs(t, bastionIP, deployerIP, expectedLogsEnabledForManagement, expectedMonitoringEnabledForManagement, false, logger) + + // Connect to the master node via SSH and handle connection errors + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + logger.Info(t, fmt.Sprintf("Failed to close SSH client: %v", err)) } }() + logger.Info(t, "SSH connection to the master successful") + t.Log("Validation in progress. Please wait...") + + // Run validations + runClusterValidationsOnManagementNode(t, sshClient, bastionIP, managementNodeIPs, expected, jobCommandMed, logger) + + // Reconnect after reboot + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + require.NoError(t, connectionErr, "Failed to re-establish SSH connection after reboot - check node recovery") // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() - // Run job - VerifyJobs(t, sshClient, jobCommandLow, testLogger) - - // Get dynamic compute node IPs - computeNodeIPList, computeIPErr := HPCGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs") - // Verify compute node configuration - VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + runClusterValidationsOnComputeNode(t, sshClient, bastionIP, staticWorkerNodeIPs, expected, jobCommandLow, logger) - // Verify SSH key for compute nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) + // Observability validations + VerifyCloudLogs(t, sshClient, options.LastTestTerraformOutputs, managementNodeIPs, staticWorkerNodeIPs, expectedLogsEnabledForManagement, expectedLogsEnabledForCompute, logger) - // Verify SSH connectivity from login node - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, 
connectionErr, "Failed to connect to the login node via SSH") + // Monitoring validations + VerifyCloudMonitoring(t, sshClient, options.LastTestTerraformOutputs, managementNodeIPs, staticWorkerNodeIPs, expectedMonitoringEnabledForManagement, expectedMonitoringEnabledForCompute, logger) - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + // Atracker validation + ibmCloudAPIKey := os.Getenv("TF_VAR_ibmcloud_api_key") + ValidateAtracker(t, ibmCloudAPIKey, utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expectedTargetType, expectedObservabilityAtrackerEnable, logger) + + //Platform validation + VerifyPlatformLogs(t, ibmCloudAPIKey, utils.GetRegion(expected.Zones), expected.ResourceGroup, expectedEnabledPlatFormLogs, logger) // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) - // Re-fetch dynamic compute node IPs - computeNodeIPList, computeIPErr = HPCGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs") + // PTR and DNS validations + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) - // Verify SSH connectivity from login node - VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) + // Verify LSF DNS + VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expected.DnsDomainName, logger) - // Verify application center configuration - VerifyAPPCenterConfig(t, sshClient, testLogger) + // Encryption validation + VerifyFileShareEncryption(t, sshClient, ibmCloudAPIKey, utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) - // Verify noVNC configuration - VerifyNoVNCConfig(t, sshClient, testLogger) + logger.Info(t, t.Name()+" validation ended") +} - // Verify PTR records for management and login nodes - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) +// ValidateClusterConfigurationWithMultipleKeys performs a comprehensive validation on the cluster setup. +// It connects to various cluster components via SSH and verifies their configurations and functionality, +// including management nodes, compute nodes, login nodes, and dynamic compute nodes. It also performs +// additional validation checks like failover procedures, SSH key verification, and DNS verification. +// The function logs detailed information throughout the validation process but does not return any value. 
+func ValidateClusterConfigurationWithMultipleKeys(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { - // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") -} + // Retrieve server IPs + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") -// ValidateClusterConfigWithAPPCenterAndLDAPOnExistingEnvironment validates the configuration of an existing cluster with App Center and LDAP integration. -// It verifies management node configuration, SSH keys, failover and failback, LSF daemon restart, dynamic compute node configuration, login node configuration, -// SSH connectivity, application center configuration, noVNC configuration, PTR records, file share encryption, and LDAP server configuration and status. -// The function connects to various nodes, performs required actions, and logs results using the provided test logger. -// Parameters include expected values, IP addresses, credentials for validation, and configuration settings. -// This ensures the cluster operates correctly with the specified configurations and integrations, including LDAP. -func ValidateClusterConfigWithAPPCenterAndLDAPOnExistingEnvironment( - t *testing.T, - computeSshKeysList []string, - bastionIP, loginNodeIP, expectedClusterName, expectedReservationID, expectedMasterName, expectedResourceGroup, - expectedKeyManagement, expectedZone, expectedDnsDomainName string, - managementNodeIPList []string, - expectedHyperthreadingEnabled bool, - ldapServerIP, expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword string, - testLogger *utils.AggregatedLogger, -) { - - expectedNumOfKeys := len(computeSshKeysList) - - // Retrieve job commands for different levels - jobCommandLow := GetJobCommand(expectedZone, "low") - jobCommandMed := GetJobCommand(expectedZone, "med") + // Set job commands for low and medium memory tasks, ignoring high memory command + jobCommandLow, jobCommandMed, _ := GenerateLSFJobCommandsForMemoryTypes() // Log validation start - testLogger.Info(t, t.Name()+" Validation started...") + logger.Info(t, t.Name()+" Validation started ......") - // Connect to the master node via SSH - sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to connect to the master via SSH") + // Connect to the management node via SSH + sshClientOne, sshClientTwo, connectionErrOne, connectionErrTwo := utils.ConnectToHostsWithMultipleUsers(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + require.NoError(t, connectionErrOne, "Failed to connect to the master via SSH") + require.NoError(t, connectionErrTwo, "Failed to connect to the master via SSH") defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + if err := sshClientOne.Close(); err != nil { + logger.Info(t, fmt.Sprintf("failed to close sshClientOne: %v", err)) + } + }() + + 
defer func() { + if err := sshClientTwo.Close(); err != nil { + logger.Info(t, fmt.Sprintf("failed to close sshClientTwo: %v", err)) } }() - testLogger.Info(t, "SSH connection to the master successful") + logger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") // Verify management node configuration - VerifyManagementNodeConfig(t, sshClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, EXPECTED_LSF_VERSION, "hpc", testLogger) + VerifyManagementNodeConfig(t, sshClientOne, expected.MasterName, expected.Hyperthreading, managementNodeIPs, expected.LsfVersion, logger) + VerifyManagementNodeConfig(t, sshClientTwo, expected.MasterName, expected.Hyperthreading, managementNodeIPs, expected.LsfVersion, logger) - // Verify SSH key on management nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) + // Verify SSH key on management node + VerifySSHKey(t, sshClientOne, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPs, expected.NumOfKeys, logger) // Perform failover and failback - FailoverAndFailback(t, sshClient, jobCommandMed, testLogger) + FailoverAndFailback(t, sshClientOne, jobCommandMed, logger) // Restart LSF daemon - RestartLsfDaemon(t, sshClient, testLogger) + RestartLsfDaemon(t, sshClientOne, logger) // Reboot instance - RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], testLogger) + RebootInstance(t, sshClientOne, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0], logger) - // Reconnect to the master node via SSH after reboot - sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH") + // Reconnect to the management node after reboot + sshClientOne, connectionErrOne = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + require.NoError(t, connectionErrOne, "Failed to reconnect to the master via SSH: %v", connectionErrOne) defer func() { - if err := sshClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshClient: %v", err)) + if err := sshClientOne.Close(); err != nil { + logger.Info(t, fmt.Sprintf("failed to close sshClientOne: %v", err)) } }() // Wait for dynamic node disappearance and handle potential errors defer func() { - if err := WaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + if err := WaitForDynamicNodeDisappearance(t, sshClientOne, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) } }() // Run job - VerifyJobs(t, sshClient, jobCommandLow, testLogger) + VerifyJobs(t, sshClientOne, jobCommandLow, logger) - // Get dynamic compute node IPs - computeNodeIPList, err := HPCGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.NoError(t, err, "Error getting dynamic compute node IPs") + // Get compute node IPs and handle errors + computeNodeIPList, err := GetComputeNodeIPs(t, sshClientOne, staticWorkerNodeIPs, logger) + if err != nil { + t.Fatalf("Failed to retrieve dynamic compute node IPs: %v", err) + } // Verify compute node configuration - VerifyComputeNodeConfig(t, 
sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - - // Verify SSH key for compute nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) + VerifyComputeNodeConfig(t, sshClientOne, expected.Hyperthreading, computeNodeIPList, logger) - // Verify SSH connectivity from login node - sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) - require.NoError(t, connectionErr, "Failed to connect to the login node via SSH") + // Verify SSH key on compute nodes + VerifySSHKey(t, sshClientOne, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expected.NumOfKeys, logger) - defer func() { - if err := sshLoginNodeClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLoginNodeClient: %v", err)) - } - }() + // Verify LSF DNS on compute nodes + VerifyLSFDNS(t, sshClientOne, computeNodeIPList, expected.DnsDomainName, logger) // Verify login node configuration - VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterName, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) - // Re-fetch dynamic compute node IPs - computeNodeIPList, connectionErr = HPCGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.NoError(t, connectionErr, "Error getting dynamic compute node IPs") + // Verify LSF DNS on login node + VerifyLSFDNS(t, sshClientOne, []string{loginNodeIP}, expected.DnsDomainName, logger) - // Verify SSH connectivity from login node - VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) + // Verify file share encryption + VerifyFileShareEncryption(t, sshClientOne, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) - // Verify application center configuration - VerifyAPPCenterConfig(t, sshClient, testLogger) + // Log validation end + logger.Info(t, t.Name()+" Validation ended") +} - // Verify noVNC configuration - VerifyNoVNCConfig(t, sshClient, testLogger) +// ValidateBasicClusterConfigurationForMultiProfileStaticAndDynamic validates key components of an LSF cluster +// with static and dynamic compute node profiles. It checks SSH connectivity, management and compute node setups, +// job execution, and file share encryption. Validation results are logged, and critical issues fail the test. +func ValidateBasicClusterConfigurationForMultiProfileStaticAndDynamic(t *testing.T, options *testhelper.TestOptions, logger *utils.AggregatedLogger) { + // Retrieve common cluster details from options + expected := GetExpectedClusterConfig(t, options) - // Verify PTR records for management and login nodes - VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + // Retrieve server IPs (logic varies for HPC vs.
LSF clusters) + bastionIP, managementNodeIPs, loginNodeIP, staticWorkerNodeIPs, getClusterIPErr := GetClusterIPs(t, options, logger) + require.NoError(t, getClusterIPErr, "Failed to get cluster IPs from Terraform outputs - check network configuration") - // Verify file share encryption - VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, managementNodeIPList, testLogger) + deployerIP, getdeployerIPErr := GetDeployerIPs(t, options, logger) + require.NoError(t, getdeployerIPErr, "Failed to get deployer IP from Terraform outputs - check deployer configuration") - // Connect to the LDAP server via SSH and handle connection errors - sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_LDAP_HOST_NAME, ldapServerIP) - require.NoError(t, connectionErr, "Failed to connect to the LDAP server via SSH") + // Set job commands for low and high memory tasks, ignoring the medium memory command + jobCommandLow, _, jobCommandHigh := GenerateLSFJobCommandsForMemoryTypes() + + // Log validation start + logger.Info(t, t.Name()+" Validation started ......") + + VerifyTestTerraformOutputs(t, bastionIP, deployerIP, false, false, false, logger) + + // Connect to the master node via SSH and handle connection errors + sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + if connectionErr != nil { + msg := fmt.Sprintf("Failed to establish SSH connection to master node via bastion (%s) -> private IP (%s): %v", bastionIP, managementNodeIPs[0], connectionErr) + logger.FAIL(t, msg) + require.FailNow(t, msg) + } defer func() { - if err := sshLdapClient.Close(); err != nil { - testLogger.Info(t, fmt.Sprintf("failed to close sshLdapClient: %v", err)) + if err := sshClient.Close(); err != nil { + logger.Info(t, fmt.Sprintf("Failed to close sshClient: %v", err)) } }() - // Check LDAP server status - CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, testLogger) + logger.Info(t, "SSH connection to the master successful") + t.Log("Validation in progress.
Please wait...") - // Verify management node LDAP config - VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPList, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Verify management node configuration + VerifyManagementNodeConfig(t, sshClient, expected.MasterName, expected.Hyperthreading, managementNodeIPs, expected.LsfVersion, logger) - // Verify compute node LDAP config - VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Wait for dynamic node disappearance and handle potential errors + defer func() { + if err := WaitForDynamicNodeDisappearance(t, sshClient, logger); err != nil { + logger.Error(t, fmt.Sprintf("Error in WaitForDynamicNodeDisappearance: %v", err)) + t.Errorf("Error in WaitForDynamicNodeDisappearance: %v", err) + } + }() + + // Run job to trigger dynamic node behavior + VerifyJobs(t, sshClient, jobCommandHigh, logger) + + // Verify dynamic node profile + ValidateDynamicNodeProfile(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, options, logger) + + // Get compute node IPs (static + dynamic) + computeNodeIPList, err := GetComputeNodeIPs(t, sshClient, staticWorkerNodeIPs, logger) + if err != nil { + t.Fatalf("Failed to retrieve compute node IPs: %v", err) + } + + // Verify compute node configuration + VerifyComputeNodeConfig(t, sshClient, expected.Hyperthreading, computeNodeIPList, logger) + + // Verify login node configuration + runClusterValidationsOnLoginNode(t, bastionIP, loginNodeIP, expected, managementNodeIPs, staticWorkerNodeIPs, jobCommandLow, logger) + + // Verify PTR records + VerifyPTRRecordsForManagement(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs, expected.DnsDomainName, logger) - // Verify login node LDAP config - VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Verify file share encryption and key management + VerifyFileShareEncryption(t, sshClient, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expected.Zones), expected.ResourceGroup, expected.MasterName, expected.KeyManagement, managementNodeIPs, logger) // Log validation end - testLogger.Info(t, t.Name()+" Validation ended") + logger.Info(t, t.Name()+" validation ended") } diff --git a/tests/lsf/constants.go b/tests/lsf/constants.go index 9984ec4c..c406a489 100644 --- a/tests/lsf/constants.go +++ b/tests/lsf/constants.go @@ -5,6 +5,7 @@ const ( LSF_PUBLIC_HOST_NAME = "ubuntu" LSF_PRIVATE_HOST_NAME = "lsfadmin" LSF_LDAP_HOST_NAME = "ubuntu" + LSF_DEPLOYER_HOST_NAME = "vpcuser" HYPERTHREADTING_TRUE = true HYPERTHREADTING_FALSE = false LSF_DEFAULT_RESOURCE_GROUP = "Default" @@ -18,15 +19,17 @@ const ( HPC_JOB_COMMAND_MED_MEM_SOUTH = `bsub -J myjob[1-1] -R "select[family=mx3d] rusage[mem=30G]" sleep 90` HPC_JOB_COMMAND_HIGH_MEM_SOUTH = `bsub -J myjob[1-1] -R "select[family=mx3d] rusage[mem=90G]" sleep 90` HPC_JOB_COMMAND_LOW_MEM_WITH_MORE_SLEEP = `bsub -J myjob[1-1] -R "select[family=mx2] rusage[mem=30G]" sleep 90` - LSF_JOB_COMMAND_LOW_MEM = `bsub -n 4 sleep 60` - LSF_JOB_COMMAND_MED_MEM = `bsub -n 6 sleep 90` + LSF_JOB_COMMAND_LOW_MEM = `bsub -n 4 sleep 90` + LSF_JOB_COMMAND_MED_MEM = `bsub -n 6 sleep 120` LSF_JOB_COMMAND_HIGH_MEM = `bsub -n 10 sleep 120` + SHAREDLOGDIRPATH = `/mnt/lsf/logs` + 
NEW_LDAP_USER_NAME = `Krishna` + NEW_LDAP_USER_PASSWORD = `Pass@1234` // pragma: allowlist secret ) var ( LSF_CUSTOM_RESOURCE_GROUP_OTHER_THAN_DEFAULT = "WES_TEST" - KMS_KEY_INSTANCE_NAME = "cicd-key-instance" - KMS_KEY_NAME = "cicd-key-name" - EXPECTED_LSF_VERSION = "10.1.0.14" + LSF_VERSION_FP14 = "10.1.0.14" + LSF_VERSION_FP15 = "10.1.0.15" SCC_INSTANCE_REGION = "us-south" ) diff --git a/tests/lsf_config.yml b/tests/lsf_config.yml deleted file mode 100644 index 41c1d44c..00000000 --- a/tests/lsf_config.yml +++ /dev/null @@ -1,47 +0,0 @@ -solution: lsf -default_existing_resource_group: Default -non_default_existing_resource_group: HPCC -zone: us-east-2 -cluster_name: HPC-LSF-1 -remote_allowed_ips: -ssh_key: geretain-hpc -login_node_instance_type: bx2-2x8 -management_image_name: "hpc-lsf10-rhel810-v2" -compute_image_name: "hpcaas-lsf10-rhel810-compute-v8" -login_image_name: "hpcaas-lsf10-rhel810-compute-v8" -management_node_instance_type: bx2-2x8 -management_node_count: 2 -worker_node_max_count: 3 -worker_node_instance_type: - - count: 1 - instance_type: "bx2-2x8" - - count: 1 - instance_type: "cx2-2x4" -enable_vpc_flow_logs: false -key_management: key_protect -kms_instance_name: -kms_key_name: -hyperthreading_enabled: true -dns_domain_name: wes.com -enable_app_center: true -app_center_gui_pwd: Pass@123 # pragma: allowlist secret -enable_ldap: true -ldap_basedns: cicd.com -ldap_admin_password: Pass@123 # pragma: allowlist secret -ldap_user_name: tester -ldap_user_password: Pass@123 # pragma: allowlist secret -us_east_zone: us-east-3 -us_east_cluster_name: HPC-LSF-1 -eu_de_zone: eu-de-3 -eu_de_cluster_name: HPC-LSF-2 -us_south_zone: us-south-1 -us_south_cluster_name: HPC-LSF-2 -jp_tok_zone: jp-tok-1 -jp_tok_cluster_name: HPC-LSF-2 -scc_enable: true -scc_event_notification_plan: standard -scc_location: us-south -observability_monitoring_enable: true -observability_monitoring_on_compute_nodes_enable: true -ssh_file_path: /artifacts/.ssh/id_rsa -ssh_file_path_two: /artifacts/.ssh/id_rsa diff --git a/tests/README.md b/tests/lsf_tests/README.md similarity index 53% rename from tests/README.md rename to tests/lsf_tests/README.md index e2b4717a..217c6a88 100644 --- a/tests/README.md +++ b/tests/lsf_tests/README.md @@ -1,10 +1,8 @@ - - -# HPC Automation +# HPC Automation - LSF ## Overview -This repository contains automation tests for High-Performance Computing as a Service (HPCaaS) using the `ibmcloud-terratest-wrapper/testhelper` library and the Terratest framework in Golang. This guide provides instructions for setting up the environment, running tests, and troubleshooting issues. +This repository contains automation tests for High-Performance Computing as a Service (HPCaaS-LSF) using the `ibmcloud-terratest-wrapper/testhelper` library and the Terratest framework in Golang. This guide provides instructions for setting up the environment, running tests, and troubleshooting issues. ## Table of Contents @@ -12,38 +10,46 @@ This repository contains automation tests for High-Performance Computing as a Se 2. [Cloning the Repository](#cloning-the-repository) 3. [Setting Up the Go Project](#setting-up-the-go-project) 4. 
[Running the Tests](#running-the-tests) - - [Passing Input Parameters](#passing-input-parameters) - - [Updating `test_config.yml`](#updating-test_configyml) - - [Command-Line Overrides](#command-line-overrides) - - [Using Default Parameters](#using-default-parameters) - - [Overriding Parameters](#overriding-parameters) - - [Running Multiple Tests](#running-multiple-tests) + + * [Passing Input Parameters](#passing-input-parameters) + + * [Updating `lsf_config.yml`](#updating-lsf_configyml) + * [Command-Line Overrides](#command-line-overrides) + * [Using Default Parameters](#using-default-parameters) + * [Overriding Parameters](#overriding-parameters) + * [Running Multiple Tests](#running-multiple-tests) 5. [Exporting API Key](#exporting-api-key) 6. [Analyzing Test Results](#analyzing-test-results) - - [Reviewing Test Output](#reviewing-test-output) - - [Viewing Test Output Logs](#viewing-test-output-logs) + + * [Reviewing Test Output](#reviewing-test-output) + * [Viewing Test Output Logs](#viewing-test-output-logs) 7. [Troubleshooting](#troubleshooting) - - [Common Issues](#common-issues) + + * [Common Issues](#common-issues) 8. [Project Structure](#project-structure) 9. [Utilities](#utilities) - - [LSF Utilities](#lsf-utilities) - - [LSF Cluster Test Utilities](#lsf-cluster-test-utilities) - - [Test Validation Utilities](#test-validation-utilities) - - [SSH Utilities](#ssh-utilities) - - [Logger Utilities](#logger-utilities) - - [Common Utilities](#common-utilities) - - [Deploy Utilities](#deploy-utilities) + + * [LSF Utilities](#lsf-utilities) + * [LSF Cluster Test Utilities](#lsf-cluster-test-utilities) + * [Test Validation Utilities](#test-validation-utilities) + * [SSH Utilities](#ssh-utilities) + * [Logger Utilities](#logger-utilities) + * [Common Utilities](#common-utilities) + * [Deploy Utilities](#deploy-utilities) 10. 
[Acknowledgments](#acknowledgments) +--- + ## Prerequisites Ensure you have the following tools and utilities installed: -- **Go Programming Language**: [Install Go](https://golang.org/doc/install) -- **Git**: [Install Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) -- **Terraform**: [Install Terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) -- **IBM Cloud CLI**: [Install IBM Cloud CLI](https://cloud.ibm.com/docs/cli?topic=cli-install-ibmcloud-cli) -- **IBM Cloud Plugins**: +* **Go Programming Language**: [Install Go](https://golang.org/doc/install) +* **Git**: [Install Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) +* **Terraform**: [Install Terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) +* **IBM Cloud CLI**: [Install IBM Cloud CLI](https://cloud.ibm.com/docs/cli?topic=cli-install-ibmcloud-cli) +* **IBM Cloud Plugins**: + ```sh ibmcloud plugin install cloud-object-storage ibmcloud plugin install vpc-infrastructure @@ -52,128 +58,161 @@ Ensure you have the following tools and utilities installed: ibmcloud plugin install key-protect -r "IBM Cloud" ``` +--- ## Cloning the Repository Clone the repository to your local machine: + ```sh git clone https://github.ibm.com/workload-eng-services/HPCaaS.git ``` +--- + ## Setting Up the Go Project -Navigate to the project directory: +Navigate to the test directory: + ```sh -cd HPCaaS/tests +cd HPCaaS/tests/ ``` Install project dependencies using Go modules: + ```sh go mod tidy ``` + Initialize Git Submodules: - ```sh - git submodule update --init - ``` +```sh +git submodule update --init +``` + +Navigate to the test directory: -# Running the Tests +```sh +cd HPCaaS/tests +``` -## Passing Input Parameters +--- -### For Solution LSF +## Running the Tests -#### Updating `lsf_config.yml` +### Passing Input Parameters -You can update the `lsf_config.yml` file to provide input parameters. This file contains default values for various parameters used during testing. Modify the values as needed to suit your testing requirements. +#### For Solution LSF -### For Solution HPC +##### Updating `lsf_config.yml` -#### Updating `hpc_config.yml` +You can update the `/tests/data/lsf_config.yml` file to provide input parameters. This file contains default values for various parameters used during testing. Modify the values as needed to suit your testing requirements. -You can update the `hpc_config.yml` file to provide input parameters. This file contains default values for various parameters used during testing. Modify the values as needed to suit your testing requirements. +--- -## Command-Line Overrides +### Command-Line Overrides -If you want to override the values in `lsf_config.yml` or `hpc_config.yml`, you can pass the input parameters through the command line. For example: +If you want to override the values in `lsf_config.yml`, you can pass the input parameters through the command line. 
For example: ```sh -SSH_KEY=your_ssh_key ZONE=your_zone EXISTING_RESOURCE_GROUP=your_existing_resource_group SOLUTION=your_solution go test -v -timeout 900m -parallel 4 -run "TestRunBasic" | tee -a $LOG_FILE_NAME +export TF_VAR_ibmcloud_api_key=your_api_key # pragma: allowlist secret +export TF_VAR_github_token=your_github_token +export LOG_FILE_NAME="filename.json" +SSH_KEYS=your_ssh_key ZONES=your_zone EXISTING_RESOURCE_GROUP=your_existing_resource_group SOLUTION=your_solution go test -v -timeout 900m -parallel 4 -run "TestRunBasic" | tee -a $LOG_FILE_NAME ``` Replace placeholders (e.g., `your_ssh_key`, `your_zone`, etc.) with actual values. -## Running a Specific Test +--- -To run a specific test, use the `-run` flag with the test name pattern. For example: +### Running a Specific Test ```sh -SOLUTION=your_solution go test -v -timeout=900m -parallel 10 -run="^TestRunBasic$" | tee -a $LOG_FILE_NAME +export TF_VAR_ibmcloud_api_key=your_api_key # pragma: allowlist secret +export TF_VAR_github_token=your_github_token +export LOG_FILE_NAME="filename.json" +SSH_KEYS=your_ssh_key go test -v -timeout=900m -parallel 10 -run="^TestRunBasic$" | tee -a $LOG_FILE_NAME ``` -This will run only the `TestRunBasic` test. - -## Using Default Parameters +--- -If you prefer to run tests with the default parameter values from the `lsf_config.yml` file, you can simply run: +### Using Default Parameters ```sh -SOLUTION=your_solution go test -v -timeout 900m -parallel 4 -run "TestRunBasic" | tee -a $LOG_FILE_NAME +export TF_VAR_ibmcloud_api_key=your_api_key # pragma: allowlist secret +export TF_VAR_github_token=your_github_token +export LOG_FILE_NAME="filename.json" +go test -v -timeout 900m -parallel 4 -run "TestRunBasic" | tee -a $LOG_FILE_NAME ``` +--- ### Overriding Parameters -To override default values, pass the required parameters when executing the command. 
Below are examples for **HPC** and **LSF** solutions: +Example: -#### Example for HPC: ```sh -SOLUTION=hpc SSH_KEY=your_ssh_key ZONE=your_zone EXISTING_RESOURCE_GROUP=your_existing_resource_group RESERVATION_ID=your_reservation_id KMS_INSTANCE_ID=your_kms_instance_id KMS_KEY_NAME=your_kms_key_name IMAGE_NAME=your_image_name CLUSTER_NAME=your_cluster_name DEFAULT_EXITING_RESOURCE_GROUP=your_default_existing_resource_group NON_DEFAULT_EXITING_RESOURCE_GROUP=your_non_default_existing_resource_group LOGIN_NODE_INSTANCE_TYPE=your_login_node_instance_type MANAGEMENT_IMAGE_NAME=your_management_image_name COMPUTE_IMAGE_NAME=your_compute_image_name MANAGEMENT_NODE_INSTANCE_TYPE=your_management_node_instance_type MANAGEMENT_NODE_COUNT=your_management_node_count ENABLE_VPC_FLOW_LOGS=true KEY_MANAGEMENT=enabled KMS_INSTANCE_NAME=your_kms_instance_name HYPERTHREADING_ENABLED=true SSH_FILE_PATH=your_ssh_file_path APP_CENTER_EXISTING_CERTIFICATE_INSTANCE=your_app_center_existing_certificate_instance go test -v -timeout=900m -parallel=4 -run "TestRunBasic"| tee -a $LOG_FILE_NAME +LSF_VERSION=your_lsf_version \ +SSH_KEYS=your_ssh_key \ +ZONES=your_zone \ +IMAGE_NAME=your_image_name \ +DEFAULT_EXISTING_RESOURCE_GROUP=your_default_existing_resource_group \ +NON_DEFAULT_EXISTING_RESOURCE_GROUP=your_non_default_existing_resource_group \ +KMS_KEY_NAME=your_kms_key_name \ +KEY_MANAGEMENT=enabled \ +KMS_INSTANCE_NAME=your_kms_instance_name \ +KMS_INSTANCE_ID=your_kms_instance_id \ +ENABLE_HYPERTHREADING=trueOrFalse \ +SSH_FILE_PATH=your_ssh_file_path \ +REMOTE_ALLOWED_IPS=your_system_ip \ +APP_CENTER_EXISTING_CERTIFICATE_INSTANCE=your_app_center_existing_certificate_instance \ +go test -v -timeout=900m -parallel=4 -run "TestRunBasic" | tee -a $LOG_FILE_NAME ``` -#### Example for LSF: -```sh -SOLUTION=lsf SSH_KEY=your_ssh_key ZONE=your_zone EXISTING_RESOURCE_GROUP=your_default_existing_resource_group WORKER_NODE_MAX_COUNT=your_worker_node_max_count WORKER_NODE_INSTANCE_TYPE=your_worker_node_instance_type KMS_INSTANCE_ID=your_kms_instance_id KMS_KEY_NAME=your_kms_key_name IMAGE_NAME=your_image_name CLUSTER_NAME=your_cluster_name DEFAULT_EXISTING_RESOURCE_GROUP=your_default_existing_resource_group NON_DEFAULT_EXISTING_RESOURCE_GROUP=your_non_default_existing_resource_group LOGIN_NODE_INSTANCE_TYPE=your_login_node_instance_type MANAGEMENT_IMAGE_NAME=your_management_image_name COMPUTE_IMAGE_NAME=your_compute_image_name MANAGEMENT_NODE_INSTANCE_TYPE=your_management_node_instance_type MANAGEMENT_NODE_COUNT=your_management_node_count ENABLE_VPC_FLOW_LOGS=true KEY_MANAGEMENT=enabled KMS_INSTANCE_NAME=your_kms_instance_name HYPERTHREADING_ENABLED=true SSH_FILE_PATH=your_ssh_file_path APP_CENTER_EXISTING_CERTIFICATE_INSTANCE=your_app_center_existing_certificate_instance go test -v -timeout=900m -parallel=4 -run "TestRunBasic" | tee -a $LOG_FILE_NAME -``` - -### Notes: -- Replace placeholders (e.g., `your_ssh_key`, `your_zone`, etc.) with the actual values applicable to your setup. -- Ensure that all required parameters are included for the respective solution type (`HPC` or `LSF`). -- Parameters like `ENABLE_VPC_FLOW_LOGS` and `HYPERTHREADING_ENABLED` can be set as `true` or `false` based on your requirement. -- The parameter EXISTING_CERTIFICATE_INSTANCE should be assigned a value only when running the PAC HA test case. 
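Before launching a long override run, it can help to confirm that the exported values the examples rely on are actually present in the current shell. The following is a minimal, illustrative pre-flight check; it only covers the variables the examples above export, so extend the list if you also export your overrides:

```sh
# Illustrative pre-flight check: abort early if an exported variable is missing.
for var in TF_VAR_ibmcloud_api_key TF_VAR_github_token LOG_FILE_NAME; do
  if [ -z "$(printenv "$var")" ]; then
    echo "ERROR: $var is not set" >&2
    exit 1
  fi
done
echo "Required variables are set; starting tests..."
```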
+--- ### Running Multiple Tests -Execute multiple tests simultaneously: ```sh +export TF_VAR_ibmcloud_api_key=your_api_key # pragma: allowlist secret +export TF_VAR_github_token=your_github_token +export LOG_FILE_NAME="filename.json" go test -v -timeout 900m -parallel 10 -run="TestRunDefault|TestRunBasic|TestRunLDAP|TestRunAppCenter" | tee -a $LOG_FILE_NAME ``` +--- + ### Specific Test Files -- `pr_test.go`: Contains tests that are run for any Pull Request (PR) raised. It ensures that changes proposed in a PR do not break existing functionality. -- `other_test.go`: Includes all P0, P1, and P2 test cases, covering all functional testing. It ensures comprehensive testing of all core functionalities. +* `lsf_pr_test.go`: PR validation tests. +* `lsf_e2e_test.go`: Functional test coverage (P0, P1, P2). +* `lsf_negative_test.go`: Negative test validations. -## Exporting API Key +--- +## Exporting API Key -# Before running tests, export your IBM Cloud API key and log file name as environment variables. -export TF_VAR_ibmcloud_api_key="your_api_key" # Replace 'your_api_key' with your actual IBM Cloud API key # pragma: allowlist secret -export LOG_FILE_NAME="your_log_file_name" # Replace 'your_log_file_name' with the desired log file name +```sh +export TF_VAR_ibmcloud_api_key="your_api_key" # pragma: allowlist secret +export LOG_FILE_NAME="your_log_file_name" +``` +--- ## Analyzing Test Results ### Reviewing Test Output -Passing Test Example: +**Pass Example:** + ```sh --- PASS: TestRunHpcBasicExample (514.35s) PASS ok github.com/terraform-ibm-modules/terraform-ibmcloud-hpc 514.369s ``` -Failing Test Example: +**Fail Example:** + ```sh --- FAIL: TestRunHpcBasicExample (663.30s) FAIL @@ -181,58 +220,101 @@ exit status 1 FAIL github.com/terraform-ibm-modules/terraform-ibcloud-hpc 663.323s ``` +--- + ### Viewing Test Output Logs -- **Console Output**: Check the console for immediate test results. -- **Log Files**: Detailed logs are saved in `test_output.log` and custom logs in the `/tests/logs_output` folder. Logs are timestamped for easier tracking (e.g., `log_20XX-MM-DD_HH-MM-SS.log`). +* **Console Output**: Immediate feedback. +* **Log Files**: Saved under `/tests/logs_output`, timestamped. + +--- ## Troubleshooting ### Common Issues -- **Missing Test Directories**: Ensure the project directory structure is correct and that the required directories for your tests are present. -- **Invalid API Key**: Verify that the `TF_VAR_ibmcloud_api_key` environment variable is set correctly.Double-check the key format and permissions. # pragma: allowlist secret -- **Invalid Solution Type**: Ensure that the `SOLUTION` environment variable is correctly defined. If it is misconfigured, tests may not run as expected. -- **Invalid SSH Key**: Confirm that the `SSH_KEY` environment variable is set properly and points to the correct SSH key file used for authentication. -- **Invalid Zone**: Check that the `ZONE` environment variable corresponds to a valid IBM Cloud zone where your resources are located. -- **Remote IP Configuration**: Ensure the `REMOTE_ALLOWED_IPS` environment variable is set to allow connections from the appropriate IP addresses. Update this if necessary. -- **Terraform Initialization**: Make sure Terraform modules and plugins are up-to-date by running `terraform init`. If any modules fail to load, investigate the error messages and ensure correct configuration. -- **Test Output Logs**: Inspect the test output logs carefully for errors and failure messages. 
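For long parallel runs it is often faster to triage the captured log than to scroll the console. The snippet below is an illustrative way to summarize a finished run from the file written via `tee -a $LOG_FILE_NAME`; it relies only on the `--- PASS:` / `--- FAIL:` lines shown above:

```sh
# Illustrative triage of a completed run: list per-test results, then count failures.
grep -E -- '--- (PASS|FAIL): ' "$LOG_FILE_NAME"
echo "Failed tests: $(grep -c -- '--- FAIL: ' "$LOG_FILE_NAME")"
```

When `LOG_FILE_NAME` points to a JSON results file, `TestMain` in `lsf_e2e_test.go` (shown later in this diff) additionally parses it and generates an HTML report after the run.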
Logs often provide useful hints on what went wrong during the test execution. -- **Resource Limitations**: Ensure there are enough resources (e.g., compute power, storage) available in the cloud environment for your tests to run successfully. -- **Network Configuration**: Double-check that your network configuration (e.g., firewall settings, security groups) allows necessary traffic for the tests. +* **Missing Test Directories** +* **Invalid API/SSH Keys** +* **Zone Misconfiguration** +* **Network/Firewall Rules** +* **Insufficient Cloud Resources** +* **Terraform Init Errors** +* **Check `REMOTE_ALLOWED_IPS`** -For additional help, contact the project maintainers. +### Debugging `TF_VAR_github_token` + +```sh +echo $TF_VAR_github_token +export TF_VAR_github_token="your_github_token" +``` + +* Ensure the token includes proper GitHub permissions (`repo`, `workflow`, etc.) + + + + +--- ## Project Structure ``` /root/HPCAAS/tests -├── README.md -├── utilities -│ ├── deployment.go # Deployment-related utility functions -│ ├── fileops.go # File operations utility functions -│ ├── helpers.go # General helper functions -│ ├── logging.go # Logging utility functions -│ ├── resources.go # Resource management utility functions -│ └── ssh.go # SSH utility functions -├── constants.go # Project-wide constants -├── go.mod # Go module definition -├── go.sum # Go module checksum -├── lsf -│ ├── cluster_helpers.go # Helper functions for cluster testing -│ ├── cluster_utils.go # General utilities for cluster operations -│ ├── cluster_validation.go # Validation logic for cluster tests -│ └── constants.go # Constants specific to LSF -├── other_tests.go # Additional test cases -├── pr_tests.go # Pull request-related tests -├── config.yml # Configuration file -└── logs # Directory for log files +│ +├── data/ # Cluster config files +│ ├── lsf_14_config.yml # Input YAML for test setup +│ └── lsf_15_config.yml # Input YAML for test setup +│ +├── deployment/ # Deployment-specific logic +│ └── lsf_deployment.go +│ +├── logs_output/ # Runtime or SSH logs +│ └── output.txt +│ +├── lsf/ # Core logic and cluster operations +│ ├── cluster_helpers.go +│ ├── cluster_utils.go +│ ├── cluster_validation.go +│ └── constants.go +│ +├── utilities/ # Shared utils across modules +│ ├── api_utils.go # IBM Cloud API interaction helpers +│ ├── fileops.go # File read/write utilities +│ ├── helpers.go # Common general-purpose functions +│ ├── logging.go # Centralized logger +│ ├── report.go # HTML/JSON report generation +│ ├── resources.go # Resource-specific helpers +│ └── ssh.go # SSH connection + command execution +│ +├── lsf_tests/ +│ ├── lsf_e2e_test.go # Full end-to-end test +│ ├── lsf_negative_test.go # Negative test scenarios +│ ├── lsf_setup.go +│ ├── lsf_constants.go +│ ├── resource_exemptions.go +│ └── README.md # Instructions for running tests +| +├── go.mod # Go module file +├── go.sum # Go module file +│── pr_test.go # PR-level minimal test ``` +--- + ## Utilities -### LSF Utilities: `lsf_cluster_utils.go` +* `lsf/` - Helper functions for LSF validation. +* `lsf_tests/` - All test cases. +* `deployment/` - Provisioning and teardown logic. +* `utilities/`: + + * `ssh.go` - SSH logic + * `logger.go` - JSON and plain log generation + * `common.go` - Shared test utilities + * `validation.go` - Functional/infra test logic + * `deploy.go` - Cluster-level provisioning + +### LSF Utilities: `cluster_utils.go` - **CheckLSFVersion**: Verify the LSF version. - **LSFCheckSSHKeyForComputeNodes**: Check SSH key for compute nodes. 
@@ -285,7 +367,7 @@ For additional help, contact the project maintainers. - **HPCGenerateFilePathMap**: Generate a file path map. - **ValidateFlowLogs**: Validate flow logs configuration. -### LSF Cluster Test Utilities: `lsf_cluster_test_utils.go` +### LSF Cluster Test Utilities: `cluster_helpers.go` - **VerifyManagementNodeConfig**: Verify configurations for management nodes. - **VerifySSHKey**: Check if the SSH key is set correctly. diff --git a/tests/constants.go b/tests/lsf_tests/lsf_constants.go similarity index 82% rename from tests/constants.go rename to tests/lsf_tests/lsf_constants.go index 2a6ae87e..a7047cbd 100644 --- a/tests/constants.go +++ b/tests/lsf_tests/lsf_constants.go @@ -16,4 +16,7 @@ const ( CLUSTER_TWO_VPC_CLUSTER_PRIVATE_SUBNETS_CIDR_BLOCKS = "10.241.17.0/24" CLUSTER_TWO_VPC_CLUSTER_LOGIN_PRIVATE_SUBNETS_CIDR_BLOCKS = "10.241.18.0/24" CLUSTER_TWO_DNS_DOMAIN_NAME = "clustertwo.com" + KMS_KEY_INSTANCE_NAME = "cicd-key-instance" + KMS_KEY_NAME = "cicd-key-name" + APP_CENTER_GUI_PASSWORD = "Admin@1234" // pragma: allowlist secret ) diff --git a/tests/lsf_tests/lsf_e2e_test.go b/tests/lsf_tests/lsf_e2e_test.go new file mode 100644 index 00000000..a99f07d6 --- /dev/null +++ b/tests/lsf_tests/lsf_e2e_test.go @@ -0,0 +1,2233 @@ +package tests + +import ( + "encoding/json" + "fmt" + "log" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper/testhelper" + deploy "github.com/terraform-ibm-modules/terraform-ibm-hpc/deployment" + lsf "github.com/terraform-ibm-modules/terraform-ibm-hpc/lsf" + utils "github.com/terraform-ibm-modules/terraform-ibm-hpc/utilities" +) + +// Constants for better organization +const ( + createVpcTerraformDir = "examples/create_vpc/" // Brand new VPC +) + +// TestMain is the entry point for all tests +func TestMain(m *testing.M) { + + // Load LSF version configuration + productFileName, err := GetLSFVersionConfig() + if err != nil { + log.Fatalf("❌ Failed to get LSF version config: %v", err) + } + + // Load and validate configuration + configFilePath, err := filepath.Abs("../data/" + productFileName) + if err != nil { + log.Fatalf("❌ Failed to resolve config path: %v", err) + } + + if _, err := os.Stat(configFilePath); err != nil { + log.Fatalf("❌ Config file not accessible: %v", err) + } + + if _, err := deploy.GetConfigFromYAML(configFilePath); err != nil { + log.Fatalf("❌ Config load failed: %v", err) + } + log.Printf("✅ Configuration loaded successfully from %s", filepath.Base(configFilePath)) + + // Execute tests + exitCode := m.Run() + + // Generate HTML report if JSON log exists + if jsonFileName, ok := os.LookupEnv("LOG_FILE_NAME"); ok { + if _, err := os.Stat(jsonFileName); err == nil { + results, err := utils.ParseJSONFile(jsonFileName) + if err != nil { + log.Printf("Failed to parse JSON results: %v", err) + } else if err := utils.GenerateHTMLReport(results); err != nil { + log.Printf("Failed to generate HTML report: %v", err) + } + } + } + + os.Exit(exitCode) +} + +// TestRunBasic validates the basic cluster configuration requirements. +// The test ensures proper resource isolation through random prefix generation +// and relies on ValidateBasicClusterConfiguration for resource cleanup. 
+// +// Prerequisites: +// - Valid environment configuration +// - Proper test suite initialization +// - Required permissions for resource operations +func TestRunBasic(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Test Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to load environment configuration") + + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + + require.NoError(t, err, "Failed to initialize test options") + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed — inspect validation logs for details", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunCustomRGAsNull validates cluster creation with null resource group value. +// Verifies proper handling of empty resource group specification and ensures +// resources are created in the expected default location. 
+// +// Prerequisites: +// - Valid environment configuration +// - Proper test suite initialization +// - Permissions to create resources in default resource group +func TestRunCustomRGAsNull(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Test Configuration + options, err := setupOptions(t, clusterNamePrefix, terraformDir, LSF_CUSTOM_EXISTING_RESOURCE_GROUP_VALUE_AS_NULL) + require.NoError(t, err, "Failed to initialize test options") + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + + lsf.ValidateBasicClusterConfiguration(t, options, testLogger) + + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed — inspect validation logs for details", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunCustomRGAsNonDefault validates cluster creation with non-default resource group. +// Ensures proper resource creation in specified resource group and verifies +// all components are correctly provisioned in the custom location. 
+// +// Prerequisites: +// - Pre-existing non-default resource group +// - Valid environment configuration +// - Proper permissions on target resource group +func TestRunCustomRGAsNonDefault(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to load environment configuration") + + // Test Configuration + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.NonDefaultExistingResourceGroup) + require.NoError(t, err, "Failed to initialize test options") + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + + lsf.ValidateBasicClusterConfiguration(t, options, testLogger) + + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed — inspect validation logs for details", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunSCCWPAndCSPMEnabledClusterValidation tests basic cluster validation with SCCWP and CSPM enabled. 
+func TestRunSCCWPAndCSPMEnabledClusterValidation(t *testing.T) {
+	t.Parallel()
+
+	// Initialization and Setup
+	setupTestSuite(t)
+	require.NotNil(t, testLogger, "Test logger must be initialized")
+	testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name()))
+
+	// Generate Unique Cluster Prefix
+	clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())
+	testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix))
+
+	// Test Configuration
+	envVars, err := GetEnvVars()
+	require.NoError(t, err, "Failed to load environment configuration")
+
+	// Skip the test if SCCWP is disabled
+	if strings.ToLower(envVars.SccWPEnabled) == "false" {
+		testLogger.Warn(t, fmt.Sprintf("Skipping %s - SCCWP disabled in configuration", t.Name()))
+		t.Skip("SCCWP is disabled in the test configuration")
+	}
+
+	options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup)
+	require.NoError(t, err, "Failed to initialize test options")
+
+	// SCCWP Specific Configuration
+	options.TerraformVars["sccwp_enable"] = envVars.SccWPEnabled
+	options.TerraformVars["cspm_enabled"] = envVars.CspmEnabled
+	options.TerraformVars["sccwp_service_plan"] = envVars.SccwpServicePlan
+	options.TerraformVars["app_config_plan"] = envVars.AppConfigPlan
+
+	// Resource Cleanup Configuration
+	options.SkipTestTearDown = true
+	defer options.TestTearDown()
+
+	// Cluster Deployment
+	deploymentStart := time.Now()
+	testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name()))
+
+	clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger)
+	require.NoError(t, clusterCreationErr, "Cluster creation validation failed")
+	testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart)))
+
+	// Post-deployment Validation
+	validationStart := time.Now()
+	lsf.ValidateBasicClusterConfigurationWithSCCWPAndCSPM(t, options, testLogger)
+	testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart)))
+
+	// Test Result Evaluation
+	if t.Failed() {
+		testLogger.Error(t, fmt.Sprintf("Test %s failed — inspect validation logs for details", t.Name()))
+	} else {
+		testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name()))
+	}
+}
+
+// TestRunNoKMSAndHTOff validates cluster creation without KMS and with hyperthreading disabled.
+// Verifies proper cluster operation with these specific configurations.
+//
+// Prerequisites:
+// - Valid environment configuration
+// - Proper test suite initialization
+// - Permissions to create resources without KMS
+func TestRunNoKMSAndHTOff(t *testing.T) {
+	t.Parallel()
+
+	// Initialization and Setup
+	setupTestSuite(t)
+	require.NotNil(t, testLogger, "Test logger must be initialized")
+	testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name()))
+
+	// Generate Unique Cluster Prefix
+	clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())
+	testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix))
+
+	// Environment Configuration
+	envVars, err := GetEnvVars()
+	require.NoError(t, err, "Failed to load environment configuration")
+
+	// Test Configuration
+	options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup)
+	require.NoError(t, err, "Failed to initialize test options")
+
+	// Special Configuration
+	options.TerraformVars["enable_cos_integration"] = false
+	options.TerraformVars["enable_vpc_flow_logs"] = false
+	options.TerraformVars["key_management"] = "null"
+	options.TerraformVars["enable_hyperthreading"] = "false"
+
+	// Resource Cleanup Configuration
+	options.SkipTestTearDown = true
+	defer options.TestTearDown()
+
+	// Cluster Deployment
+	deploymentStart := time.Now()
+	testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name()))
+
+	clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger)
+	require.NoError(t, clusterCreationErr, "Cluster creation validation failed")
+
+	testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart)))
+
+	// Post-deployment Validation
+	validationStart := time.Now()
+	lsf.ValidateBasicClusterConfiguration(t, options, testLogger)
+
+	testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart)))
+
+	// Test Result Evaluation
+	if t.Failed() {
+		testLogger.Error(t, fmt.Sprintf("Test %s failed — inspect validation logs for details", t.Name()))
+	} else {
+		testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name()))
+	}
+}
+
+// TestRunUsingExistingKMSInstanceAndExistingKey validates cluster creation with an existing Key Protect service instance and key.
+// Verifies proper KMS integration and encryption functionality.
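+
+// Editor's note: the KMS-backed tests below provision an external dependency up front and
+// guarantee its removal with a deferred delete. The generic shape of that pattern is sketched
+// here for reference only; withExternalDependency is a hypothetical helper, not part of this
+// suite, and provision/teardown stand in for calls such as lsf.CreateServiceInstanceAndKmsKey
+// and lsf.DeleteServiceInstanceAndAssociatedKeys.
+func withExternalDependency(t *testing.T, provision func() error, teardown func()) {
+	t.Helper()
+	if err := provision(); err != nil {
+		t.Fatalf("failed to provision external dependency: %v", err)
+	}
+	// t.Cleanup runs even when the test aborts later, so the external resource is not leaked.
+	t.Cleanup(teardown)
+}
+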
+// +// Prerequisites: +// - Valid IBM Cloud API key +// - Permissions to create/delete KMS instances +// - Proper test suite initialization +func TestRunUsingExistingKMSInstanceAndExistingKey(t *testing.T) { + t.Parallel() + + // Initialization + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Load Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + // KMS Setup + const ( + keyManagementType = "key_protect" + kmsKeyName = KMS_KEY_NAME + ) + + kmsInstanceName := "cicd-" + utils.GenerateRandomString() + apiKey := os.Getenv("TF_VAR_ibmcloud_api_key") + require.NotEmpty(t, apiKey, "IBM Cloud API key must be set") + + region := utils.GetRegion(envVars.Zones) + testLogger.Info(t, fmt.Sprintf("Creating KMS instance: %s in region: %s", kmsInstanceName, region)) + + err = lsf.CreateServiceInstanceAndKmsKey( + t, + apiKey, + region, + envVars.DefaultExistingResourceGroup, + kmsInstanceName, + kmsKeyName, + testLogger, + ) + require.NoError(t, err, "Must create KMS service instance and key") + + // Cleanup KMS instance after test + defer func() { + testLogger.Info(t, fmt.Sprintf("Deleting KMS instance: %s", kmsInstanceName)) + lsf.DeleteServiceInstanceAndAssociatedKeys( + t, + apiKey, + region, + envVars.DefaultExistingResourceGroup, + kmsInstanceName, + testLogger, + ) + }() + + // Prepare Test Options + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + // Set KMS Terraform Variables + options.TerraformVars["key_management"] = keyManagementType + options.TerraformVars["kms_instance_name"] = kmsInstanceName + options.TerraformVars["kms_key_name"] = kmsKeyName + + // Cluster Teardown Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Final Result + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunUsingExistingKMSInstanceAndWithoutKey validates cluster creation with existing KMS instance but no key. +// Verifies proper handling of KMS instance without specified key. 
+// +// Prerequisites: +// - Valid IBM Cloud API key +// - Permissions to create/delete KMS instances +// - Proper test suite initialization +func TestRunUsingExistingKMSInstanceAndWithoutKey(t *testing.T) { + t.Parallel() + + // Initialization + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Load Environment Variables + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + // KMS Setup + kmsInstanceName := "cicd-" + utils.GenerateRandomString() + apiKey := os.Getenv("TF_VAR_ibmcloud_api_key") + require.NotEmpty(t, apiKey, "IBM Cloud API key must be set") + + region := utils.GetRegion(envVars.Zones) + + testLogger.Info(t, fmt.Sprintf("Creating KMS instance: %s", kmsInstanceName)) + err = lsf.CreateServiceInstanceAndKmsKey( + t, + apiKey, + region, + envVars.DefaultExistingResourceGroup, + kmsInstanceName, + KMS_KEY_NAME, + testLogger, + ) + require.NoError(t, err, "Must create KMS service instance and key") + + // Ensure cleanup of KMS instance and keys + defer func() { + testLogger.Info(t, fmt.Sprintf("Deleting KMS instance: %s", kmsInstanceName)) + lsf.DeleteServiceInstanceAndAssociatedKeys( + t, + apiKey, + region, + envVars.DefaultExistingResourceGroup, + kmsInstanceName, + testLogger, + ) + }() + + // Test Options Configuration + options, err := setupOptions( + t, + clusterNamePrefix, // Generate Unique Cluster Prefix + terraformDir, + envVars.DefaultExistingResourceGroup, + ) + require.NoError(t, err, "Must initialize valid test options") + + // KMS Variables (no key) + options.TerraformVars["key_management"] = "key_protect" + options.TerraformVars["kms_instance_name"] = kmsInstanceName + + // Cluster Teardown Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunWithExistingKMSInstanceAndKeyWithAuthorizationPolicy validates that a cluster can be +// deployed using an existing KMS instance and key, assuming that the IAM authorization +// policy is already in place between the KMS instance and the VPC file share. 
+// +// Prerequisites: +// - Valid IBM Cloud API key +// - IAM authorization policy already enabled for the KMS instance and VPC file share +// - Proper test suite initialization +func TestRunWithExistingKMSInstanceAndKeyWithAuthorizationPolicy(t *testing.T) { + t.Parallel() + + // Initialization + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Load Environment Variables + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + // API Key Validation + apiKey := os.Getenv("TF_VAR_ibmcloud_api_key") + require.NotEmpty(t, apiKey, "IBM Cloud API key must be set") + + // Test Options Configuration + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + // Set KMS-related variables + options.TerraformVars["key_management"] = "key_protect" + options.TerraformVars["kms_instance_name"] = envVars.KMSInstanceName + options.TerraformVars["kms_key_name"] = envVars.KMSKeyName + options.TerraformVars["skip_iam_share_authorization_policy"] = true + options.TerraformVars["skip_iam_block_storage_authorization_policy"] = true + + // Cluster Teardown Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Final Result + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunLSFClusterCreationWithZeroWorkerNodes validates cluster creation with zero worker nodes. +// Verifies proper handling of empty static compute profile and dynamic scaling configuration. 
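+
+// Editor's note: illustrative sketch only (not used by this suite). The static and dynamic
+// compute blocks below are plain []map[string]interface{} literals; marshalling one shows the
+// JSON-compatible shape that ultimately reaches Terraform. Relies on the "encoding/json"
+// import this file already uses elsewhere.
+func previewInstanceBlock(profile string, count int, image string) (string, error) {
+	block := []map[string]interface{}{
+		{"profile": profile, "count": count, "image": image},
+	}
+	// e.g. [{"count":0,"image":"<image-name>","profile":"bx2d-4x16"}]
+	b, err := json.Marshal(block)
+	return string(b), err
+}
+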
+// +// Prerequisites: +// - Valid environment configuration +// - Proper test suite initialization +// - Permissions to create cluster with dynamic scaling +func TestRunLSFClusterCreationWithZeroWorkerNodes(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + // Test Configuration + options, err := setupOptions( + t, + clusterNamePrefix, // Generate Unique Cluster Prefix + terraformDir, + envVars.DefaultExistingResourceGroup, + ) + require.NoError(t, err, "Must initialize valid test options") + + // Cluster Profile Configuration + options.TerraformVars["static_compute_instances"] = []map[string]interface{}{ + { + "profile": "bx2d-4x16", + "count": 0, + "image": envVars.StaticComputeInstancesImage, + }, + } + + options.TerraformVars["dynamic_compute_instances"] = []map[string]interface{}{ + { + "profile": "cx2-2x4", + "count": 1024, + "image": envVars.DynamicComputeInstancesImage, + }, + } + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfigurationWithDynamicProfile(t, options, testLogger) + + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs for details", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunLDAP validates cluster creation with LDAP authentication enabled. +// Verifies proper LDAP configuration and user authentication functionality. 
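+
+// Editor's note: illustrative sketch only (not used by this suite). ldap_basedns in the test
+// below is a DNS-style domain (for example "example.com"); LDAP tooling usually expects the
+// equivalent DN ("dc=example,dc=com"). A minimal conversion using the existing "strings" import:
+func domainToBaseDN(domain string) string {
+	parts := strings.Split(domain, ".")
+	for i, p := range parts {
+		parts[i] = "dc=" + p
+	}
+	return strings.Join(parts, ",")
+}
+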
+// +// Prerequisites: +// - LDAP enabled in environment configuration +// - Valid LDAP credentials (admin password, username, user password) +// - Proper test suite initialization +func TestRunLDAP(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Load Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + // Validate LDAP Configuration + require.Equal(t, "true", strings.ToLower(envVars.EnableLdap), "LDAP must be enabled for this test") + require.NotEmpty(t, envVars.LdapAdminPassword, "LDAP admin password must be provided") // pragma: allowlist secret + require.NotEmpty(t, envVars.LdapUserName, "LDAP username must be provided") + require.NotEmpty(t, envVars.LdapUserPassword, "LDAP user password must be provided") // pragma: allowlist secret + + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + // Set LDAP Terraform Variables + options.TerraformVars["enable_ldap"] = strings.ToLower(envVars.EnableLdap) + options.TerraformVars["ldap_basedns"] = envVars.LdapBaseDns + options.TerraformVars["ldap_admin_password"] = envVars.LdapAdminPassword // pragma: allowlist secret + options.TerraformVars["ldap_user_name"] = envVars.LdapUserName + options.TerraformVars["ldap_user_password"] = envVars.LdapUserPassword // pragma: allowlist secret + + // Configure Resource Cleanup + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateLDAPClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Final Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunExistingLDAP validates cluster creation with existing LDAP integration. +// Verifies proper configuration of LDAP authentication with an existing LDAP server. 
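+
+// Editor's note: illustrative sketch only. The test below obtains ldap_server_cert through
+// lsf.GetLDAPServerCert; a generic way to capture whatever certificate a server presents, in
+// PEM form, is shown here. It assumes "crypto/tls" and "encoding/pem" imports and an LDAPS
+// endpoint (port 636); it is not the helper this suite actually uses.
+func fetchServerCertPEM(addr string) (string, error) {
+	// InsecureSkipVerify is acceptable here because only the raw certificate bytes are wanted.
+	conn, err := tls.Dial("tcp", addr, &tls.Config{InsecureSkipVerify: true}) // #nosec G402
+	if err != nil {
+		return "", err
+	}
+	defer conn.Close()
+	cert := conn.ConnectionState().PeerCertificates[0]
+	return string(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: cert.Raw})), nil
+}
+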
+//
+// Prerequisites:
+// - LDAP enabled in environment configuration
+// - Valid LDAP credentials
+// - Existing LDAP server configuration
+// - Proper test suite initialization
+func TestRunExistingLDAP(t *testing.T) {
+	t.Parallel()
+
+	// Initialization and Setup
+	setupTestSuite(t)
+	require.NotNil(t, testLogger, "Test logger must be initialized")
+	testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name()))
+
+	// Generate Unique Cluster Prefix
+	clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())
+	testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix))
+
+	// Environment Configuration
+	envVars, err := GetEnvVars()
+	require.NoError(t, err, "Must load valid environment configuration")
+
+	// LDAP Validation
+	require.Equal(t, "true", strings.ToLower(envVars.EnableLdap), "LDAP must be enabled for this test")
+	require.NotEmpty(t, envVars.LdapAdminPassword, "LDAP admin password must be provided") // pragma: allowlist secret
+	require.NotEmpty(t, envVars.LdapUserName, "LDAP username must be provided")
+	require.NotEmpty(t, envVars.LdapUserPassword, "LDAP user password must be provided") // pragma: allowlist secret
+
+	// First Cluster Configuration
+	options1, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup)
+	require.NoError(t, err, "Must initialize valid test options for first cluster")
+
+	// First Cluster LDAP Configuration
+	options1.TerraformVars["enable_ldap"] = strings.ToLower(envVars.EnableLdap)
+	options1.TerraformVars["ldap_basedns"] = envVars.LdapBaseDns
+	options1.TerraformVars["ldap_admin_password"] = envVars.LdapAdminPassword // pragma: allowlist secret
+	options1.TerraformVars["ldap_user_name"] = envVars.LdapUserName
+	options1.TerraformVars["ldap_user_password"] = envVars.LdapUserPassword // pragma: allowlist secret
+	//options1.TerraformVars["key_management"] = "null"
+
+	// First Cluster Cleanup
+	options1.SkipTestTearDown = true
+	defer options1.TestTearDown()
+
+	// First Cluster Validation
+	output, err := options1.RunTest()
+	require.NoError(t, err, "First cluster validation failed")
+	require.NotNil(t, output, "First cluster validation returned nil output")
+
+	// Retrieve custom resolver ID
+	customResolverID, err := utils.GetCustomResolverID(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zones), envVars.DefaultExistingResourceGroup, clusterNamePrefix, testLogger)
+	require.NoError(t, err, "Error retrieving custom resolver ID: %v", err)
+
+	// Retrieve LDAP IP and Bastion IP
+	ldapIP, err := utils.GetLdapIP(t, options1, testLogger)
+	require.NoError(t, err, "Error retrieving LDAP IP address: %v", err)
+
+	ldapServerBastionIP, err := utils.GetBastionIP(t, options1, testLogger)
+	require.NoError(t, err, "Error retrieving LDAP server bastion IP address: %v", err)
+
+	// Update the security group to allow LDAP traffic (port 389) from the given CIDR
+	sgUpdateErr := utils.RetrieveAndUpdateSecurityGroup(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zones), envVars.DefaultExistingResourceGroup, clusterNamePrefix, "10.241.0.0/18", "389", "389", testLogger)
+	require.NoError(t, sgUpdateErr, "Failed to update the security group with the LDAP port rule")
+
+	// Second Cluster Configuration
+	hpcClusterPrefix2 := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())
+	testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", hpcClusterPrefix2))
+
+	options2, err := setupOptions(t, hpcClusterPrefix2, terraformDir, envVars.DefaultExistingResourceGroup)
+	require.NoError(t, err, "Error 
setting up test options for the second cluster: %v", err) + + // LDAP Certificate Retrieval + ldapServerCert, serverCertErr := lsf.GetLDAPServerCert(lsf.LSF_PUBLIC_HOST_NAME, ldapServerBastionIP, lsf.LSF_LDAP_HOST_NAME, ldapIP) + require.NoError(t, serverCertErr, "Must retrieve LDAP server certificate") + testLogger.Info(t, fmt.Sprintf("LDAP server certificate : %s ", strings.TrimSpace(ldapServerCert))) + + // Second Cluster LDAP Configuration + options2.TerraformVars["vpc_name"] = options1.TerraformVars["cluster_prefix"].(string) + "-lsf" + options2.TerraformVars["vpc_cluster_private_subnets_cidr_blocks"] = CLUSTER_TWO_VPC_CLUSTER_PRIVATE_SUBNETS_CIDR_BLOCKS + options2.TerraformVars["vpc_cluster_login_private_subnets_cidr_blocks"] = CLUSTER_TWO_VPC_CLUSTER_LOGIN_PRIVATE_SUBNETS_CIDR_BLOCKS + dnsMap := map[string]string{ + "compute": "comp2.com", + } + dnsJSON, err := json.Marshal(dnsMap) + require.NoError(t, err, "Must convert to JSON string") + + options2.TerraformVars["dns_domain_name"] = string(dnsJSON) + options2.TerraformVars["dns_custom_resolver_id"] = customResolverID + options2.TerraformVars["enable_ldap"] = strings.ToLower(envVars.EnableLdap) + options2.TerraformVars["ldap_basedns"] = envVars.LdapBaseDns + options2.TerraformVars["ldap_server"] = ldapIP + options2.TerraformVars["ldap_server_cert"] = strings.TrimSpace(ldapServerCert) + + // Second Cluster Cleanup + options2.SkipTestTearDown = true + defer options2.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options2, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Second Cluster Validation + validationStart := time.Now() + testLogger.Info(t, "Starting existing LDAP validation for second cluster") + + lsf.ValidateExistingLDAPClusterConfig(t, ldapServerBastionIP, ldapIP, envVars.LdapBaseDns, envVars.LdapAdminPassword, envVars.LdapUserName, envVars.LdapUserPassword, options2, testLogger) + + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunCosAndVpcFlowLogs validates cluster creation with COS integration and VPC flow logs enabled. +// Verifies proper configuration of both features and their integration with the cluster. 
+// +// Prerequisites: +// - Valid environment configuration +// - Proper test suite initialization +// - Permissions to enable COS and VPC flow logs +func TestRunCosAndVpcFlowLogs(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Load Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + // Setup Test Options + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + // Terraform Input Variables + options.TerraformVars["enable_cos_integration"] = true + options.TerraformVars["enable_vpc_flow_logs"] = true + + // Skip resource teardown to retain cluster for debugging if needed + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed in %v", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Cluster validation completed in %v", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunLSFLogs validates proper configuration of LSF management logs. +// Verifies log directory structure, symbolic links, and log collection. 
+// +// Prerequisites: +// - Valid environment configuration +// - Cluster with at least two management nodes +// - Proper test suite initialization +func TestRunLSFLogs(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Load Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + // Setup Test Options + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + // Skip resource teardown to allow for post-run inspection + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfigurationLSFLogs(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Cluster validation completed in %v", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunDedicatedHost validates cluster creation with dedicated hosts. +// Verifies proper provisioning and configuration of dedicated host resources. 
+// +// Prerequisites: +// - Valid environment configuration +// - Proper test suite initialization +// - Permissions to create dedicated hosts +func TestRunDedicatedHost(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Load Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + // Setup Test Options + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + // Dedicated Host and Compute Configuration + + options.TerraformVars["enable_dedicated_host"] = true + options.TerraformVars["static_compute_instances"] = []map[string]interface{}{ + { + "profile": "bx2-2x8", + "count": 1, + "image": envVars.StaticComputeInstancesImage, + }, + } + options.TerraformVars["dynamic_compute_instances"] = []map[string]interface{}{ + { + "profile": "cx2-2x4", + "count": 1024, + "image": envVars.DynamicComputeInstancesImage, + }, + } + + // Skip resource teardown for inspection + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfigurationWithDedicatedHost(t, options, true, testLogger) + testLogger.Info(t, fmt.Sprintf("Cluster validation completed in %v", time.Since(validationStart))) + + // Test Outcome Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestObservabilityAllFeaturesDisabled verifies cluster creation when all observability features +// (logs, monitoring, Atracker) are disabled. It ensures that the cluster functions correctly +// without any observability configurations. 
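+
+// Editor's note: illustrative sketch only. Each observability test below sets the same group of
+// Terraform variables inline; if that repetition ever becomes a maintenance burden, a small
+// helper over the suite's existing *testhelper.TestOptions type could centralize it. This is a
+// hypothetical refactor, not something the tests currently call.
+func applyObservabilityVars(options *testhelper.TestOptions, vars map[string]interface{}) {
+	for key, value := range vars {
+		options.TerraformVars[key] = value
+	}
+}
+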
+// +// Prerequisites: +// - Valid environment setup +// - No dependency on observability services +func TestObservabilityAllFeaturesDisabled(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + // Disable all observability features + options.TerraformVars["observability_enable_platform_logs"] = false + options.TerraformVars["observability_logs_enable_for_management"] = false + options.TerraformVars["observability_logs_enable_for_compute"] = false + options.TerraformVars["observability_monitoring_enable"] = false + options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = false + options.TerraformVars["observability_monitoring_plan"] = "graduated-tier" + options.TerraformVars["observability_atracker_enable"] = false + options.TerraformVars["observability_atracker_target_type"] = "cos" + + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + err = lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, err, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicObservabilityClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed in %v", time.Since(validationStart))) + + // Test Outcome Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s passed", t.Name())) + } +} + +// TestObservabilityLogsEnabledForManagementAndCompute validates cluster creation with +// observability logs enabled for both management and compute nodes. 
+// +// Prerequisites: +// - Valid environment setup +// - Permissions to enable log services +func TestObservabilityLogsEnabledForManagementAndCompute(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + // Enable logs for management and compute; disable other observability features + options.TerraformVars["observability_logs_enable_for_management"] = true + options.TerraformVars["observability_logs_enable_for_compute"] = true + options.TerraformVars["observability_enable_platform_logs"] = false + options.TerraformVars["observability_monitoring_enable"] = false + options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = false + options.TerraformVars["observability_monitoring_plan"] = "graduated-tier" + options.TerraformVars["observability_atracker_enable"] = false + options.TerraformVars["observability_atracker_target_type"] = "cos" + + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + err = lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, err, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfigurationWithCloudLogs(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed in %v", time.Since(validationStart))) + + // Test Outcome Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s passed", t.Name())) + } +} + +// TestObservabilityMonitoringEnabledForManagementAndCompute validates cluster creation with +// observability monitoring enabled for both management and compute nodes. 
+// +// Prerequisites: +// - Valid environment setup +// - Permissions to enable monitoring features +func TestObservabilityMonitoringEnabledForManagementAndCompute(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + // Enable monitoring; disable logs and Atracker + options.TerraformVars["observability_logs_enable_for_management"] = false + options.TerraformVars["observability_logs_enable_for_compute"] = false + options.TerraformVars["observability_enable_platform_logs"] = false + options.TerraformVars["observability_monitoring_enable"] = true + options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = true + options.TerraformVars["observability_monitoring_plan"] = "graduated-tier" + options.TerraformVars["observability_atracker_enable"] = false + options.TerraformVars["observability_atracker_target_type"] = "cloudlogs" + + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + err = lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, err, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfigurationWithCloudMonitoring(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed in %v", time.Since(validationStart))) + + // Test Outcome Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s passed", t.Name())) + } +} + +// TestObservabilityAtrackerLoggingMonitoring provisions LSF clusters with full observability configurations, +// including logging, monitoring, and Atracker integration, to verify end-to-end behavior across different targets. +// +// Scenarios covered: +// - Logging and monitoring enabled, Atracker targeting COS +// - Logging and monitoring enabled, Atracker targeting Cloud Logs +// +// Each test validates cluster creation and configuration integrity under the given observability setup. +// Note: Due to Atracker's 1-target-per-region limit, COS and Cloud Logs scenarios are executed sequentially. 
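+
+// Editor's note: illustrative sketch only. The note above explains why the scenarios in the next
+// test run sequentially: the subtest closure simply omits t.Parallel(). The skeleton below shows
+// both execution modes side by side; it is a generic pattern, not a helper used by this suite.
+func runScenarios(t *testing.T, names []string, parallel bool, body func(t *testing.T, name string)) {
+	for _, name := range names {
+		name := name // capture range variable, as the tests below do
+		t.Run(name, func(t *testing.T) {
+			if parallel {
+				// Parallel subtests interleave; safe only when scenarios do not contend for a
+				// per-region singleton such as an Atracker route target.
+				t.Parallel()
+			}
+			body(t, name)
+		})
+	}
+}
+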
+
+func TestObservabilityAtrackerLoggingMonitoring(t *testing.T) {
+	t.Parallel()
+
+	setupTestSuite(t)
+	require.NotNil(t, testLogger, "Test logger must be initialized")
+
+	envVars, err := GetEnvVars()
+	require.NoError(t, err, "Must load valid environment configuration")
+
+	scenarios := []struct {
+		name                string
+		logsForManagement   bool
+		logsForCompute      bool
+		platformLogs        bool
+		monitoring          bool
+		monitoringOnCompute bool
+		atrackerTargetType  string
+		validationFunc      func(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger)
+	}{
+		{
+			name:                "Logs_Monitoring_Atracker_COS",
+			logsForManagement:   true,
+			logsForCompute:      true,
+			platformLogs:        false,
+			monitoring:          true,
+			monitoringOnCompute: true,
+			atrackerTargetType:  "cos",
+			validationFunc:      lsf.ValidateBasicObservabilityClusterConfiguration,
+		},
+		{
+			name:                "Logs_Monitoring_Atracker_CloudLogs",
+			logsForManagement:   true,
+			logsForCompute:      true,
+			platformLogs:        true,
+			monitoring:          true,
+			monitoringOnCompute: true,
+			atrackerTargetType:  "cloudlogs",
+			validationFunc:      lsf.ValidateBasicObservabilityClusterConfiguration,
+		},
+	}
+
+	for _, sc := range scenarios {
+		scenario := sc // capture range variable
+
+		t.Run(scenario.name, func(t *testing.T) {
+			testLogger.Info(t, fmt.Sprintf("Scenario %s started", scenario.name))
+
+			clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())
+			testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix))
+
+			options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup)
+			require.NoError(t, err, "Must initialize valid test options")
+
+			options.TerraformVars["observability_enable_platform_logs"] = scenario.platformLogs
+			options.TerraformVars["observability_logs_enable_for_management"] = scenario.logsForManagement
+			options.TerraformVars["observability_logs_enable_for_compute"] = scenario.logsForCompute
+			options.TerraformVars["observability_monitoring_enable"] = scenario.monitoring
+			options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = scenario.monitoringOnCompute
+			options.TerraformVars["observability_monitoring_plan"] = "graduated-tier"
+			options.TerraformVars["observability_atracker_enable"] = true
+			options.TerraformVars["observability_atracker_target_type"] = scenario.atrackerTargetType
+			options.TerraformVars["zones"] = utils.SplitAndTrim(envVars.AttrackerTestZone, ",")
+			options.SkipTestTearDown = true
+			defer options.TestTearDown()
+
+			testLogger.Info(t, fmt.Sprintf("Deploying cluster for: %s", scenario.name))
+			err = lsf.VerifyClusterCreationAndConsistency(t, options, testLogger)
+			require.NoError(t, err, "Cluster creation failed")
+
+			testLogger.Info(t, "Starting validation...")
+			scenario.validationFunc(t, options, testLogger)
+
+			if t.Failed() {
+				testLogger.Error(t, fmt.Sprintf("Scenario %s failed", scenario.name))
+			} else {
+				testLogger.PASS(t, fmt.Sprintf("Scenario %s passed", scenario.name))
+			}
+		})
+	}
+}
+
+// TestObservabilityAtrackerWithCosAndCloudLogs provisions LSF clusters with different Atracker targets
+// (COS and Cloud Logs) and validates basic observability integration.
+//
+// Each scenario disables logging and monitoring features while testing Atracker routing separately.
+// This ensures that Atracker configurations function correctly, even when other observability
+// options are turned off.
+// +// Scenarios: +// - Atracker targeting COS +// - Atracker targeting Cloud Logs +// +// Note: Atracker route target capacity is limited to 1 per region. These test cases are run in parallel +// to validate coexistence across configurations within that constraint. + +func TestObservabilityAtrackerWithCosAndCloudLogs(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + scenarios := []struct { + name string + logsForManagement bool + logsForCompute bool + platformLogs bool + monitoring bool + monitoringOnCompute bool + atrackerTargetType string + validationFunc func(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) + }{ + { + name: "Atracker_COS_Only", + logsForManagement: false, + logsForCompute: false, + platformLogs: false, + monitoring: false, + monitoringOnCompute: false, + atrackerTargetType: "cos", + validationFunc: lsf.ValidateBasicClusterConfigurationWithCloudAtracker, + }, + { + name: "Atracker_CloudLogs_Only", + logsForManagement: false, + logsForCompute: false, + platformLogs: false, + monitoring: false, + monitoringOnCompute: false, + atrackerTargetType: "cloudlogs", + validationFunc: lsf.ValidateBasicClusterConfigurationWithCloudAtracker, + }, + } + + for _, sc := range scenarios { + scenario := sc // capture range variable + + t.Run(scenario.name, func(t *testing.T) { + t.Parallel() + testLogger.Info(t, fmt.Sprintf("Scenario %s started", scenario.name)) + + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Must initialize valid test options") + + options.TerraformVars["observability_enable_platform_logs"] = scenario.platformLogs + options.TerraformVars["observability_logs_enable_for_management"] = scenario.logsForManagement + options.TerraformVars["observability_logs_enable_for_compute"] = scenario.logsForCompute + options.TerraformVars["observability_monitoring_enable"] = scenario.monitoring + options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = scenario.monitoringOnCompute + options.TerraformVars["observability_monitoring_plan"] = "graduated-tier" + options.TerraformVars["observability_atracker_enable"] = true + options.TerraformVars["observability_atracker_target_type"] = scenario.atrackerTargetType + options.TerraformVars["zones"] = utils.SplitAndTrim(envVars.AttrackerTestZone, ",") + options.SkipTestTearDown = true + defer options.TestTearDown() + + testLogger.Info(t, fmt.Sprintf("Deploying cluster for: %s", scenario.name)) + err = lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, err, "Cluster creation failed") + + testLogger.Info(t, "Starting validation...") + scenario.validationFunc(t, options, testLogger) + + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Scenario %s failed", scenario.name)) + } else { + testLogger.PASS(t, fmt.Sprintf("Scenario %s passed", scenario.name)) + } + }) + } +} + +// ******************** Region Specific Test ***************** + +// TestRunInUsEastRegion validates cluster creation in US East region with b* profile. +// Verifies proper zone configuration and resource deployment in the specified region. 
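+
+// Editor's note: illustrative sketch only. The region-specific tests below pass zone names such
+// as "us-east-1", and helpers like utils.GetRegion derive the region from them. Under the usual
+// IBM Cloud naming convention (region plus "-<zone number>"), a minimal derivation looks like
+// this; it is not the implementation the suite uses.
+func regionFromZone(zone string) string {
+	// "us-east-1" -> "us-east"; the input is returned unchanged when no "-" is present.
+	if i := strings.LastIndex(zone, "-"); i > 0 {
+		return zone[:i]
+	}
+	return zone
+}
+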
+// +// Prerequisites: +// - Valid US East zone configuration +// - Proper test suite initialization +// - Permissions to create resources in US East region +func TestRunInUsEastRegion(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + usEastZone := utils.SplitAndTrim(envVars.USEastZone, ",") + require.NotEmpty(t, usEastZone, "Must provide valid US East zone configuration") + testLogger.DEBUG(t, fmt.Sprintf("Using US East zones: %v", usEastZone)) + + // Test Configuration + options, err := setupOptions( + t, + clusterNamePrefix, // Generate Unique Cluster Prefix + terraformDir, + envVars.DefaultExistingResourceGroup, + ) + require.NoError(t, err, "Must initialize valid test options") + + // Region-Specific Configuration + options.TerraformVars["zones"] = usEastZone + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfiguration(t, options, testLogger) + + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs for details", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunInEuDeRegion validates cluster creation in Frankfurt region with c* profile. +// Verifies proper zone configuration and resource deployment in the specified region. 
+// +// Prerequisites: +// - Valid EU-DE zone configuration +// - Proper test suite initialization +// - Permissions to create resources in EU-DE region +func TestRunInEuDeRegion(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + euDeZone := utils.SplitAndTrim(envVars.EUDEZone, ",") + require.NotEmpty(t, euDeZone, "Must provide valid Frankfurt zone configuration") + testLogger.DEBUG(t, fmt.Sprintf("Using Frankfurt zones: %v", euDeZone)) + + // Test Configuration + options, err := setupOptions( + t, + clusterNamePrefix, // Generate Unique Cluster Prefix + terraformDir, + envVars.DefaultExistingResourceGroup, + ) + require.NoError(t, err, "Must initialize valid test options") + + // Region-Specific Configuration + options.TerraformVars["zones"] = euDeZone + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfiguration(t, options, testLogger) + + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs for details", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunInUSSouthRegion validates cluster creation in US South region with m* profile. +// Verifies proper zone configuration and resource deployment in the specified region. 
+//
+// Prerequisites:
+// - Valid US South zone configuration
+// - Proper test suite initialization
+// - Permissions to create resources in US South region
+func TestRunInUSSouthRegion(t *testing.T) {
+	t.Parallel()
+
+	// Initialization and Setup
+	setupTestSuite(t)
+	require.NotNil(t, testLogger, "Test logger must be initialized")
+	testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name()))
+
+	// Generate Unique Cluster Prefix
+	clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())
+	testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix))
+
+	// Environment Configuration
+	envVars, err := GetEnvVars()
+	require.NoError(t, err, "Must load valid environment configuration")
+
+	usSouthZone := utils.SplitAndTrim(envVars.USSouthZone, ",")
+	require.NotEmpty(t, usSouthZone, "Must provide valid US South zone configuration")
+	testLogger.DEBUG(t, fmt.Sprintf("Using US South zones: %v", usSouthZone))
+
+	// Test Configuration
+	options, err := setupOptions(
+		t,
+		clusterNamePrefix, // Generate Unique Cluster Prefix
+		terraformDir,
+		envVars.DefaultExistingResourceGroup,
+	)
+	require.NoError(t, err, "Must initialize valid test options")
+
+	// Region-Specific Configuration
+	options.TerraformVars["zones"] = usSouthZone
+
+	// Resource Cleanup Configuration
+	options.SkipTestTearDown = true
+	defer options.TestTearDown()
+
+	// Cluster Deployment
+	deploymentStart := time.Now()
+	testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name()))
+
+	clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger)
+	require.NoError(t, clusterCreationErr, "Cluster creation validation failed")
+
+	testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart)))
+
+	// Post-deployment Validation
+	validationStart := time.Now()
+	lsf.ValidateBasicClusterConfiguration(t, options, testLogger)
+
+	testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart)))
+
+	// Test Result Evaluation
+	if t.Failed() {
+		testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name()))
+	} else {
+		testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name()))
+	}
+}
+
+// TestRunInJPTokyoRegion validates cluster creation in Japan Tokyo region with m* profile.
+// Verifies proper zone configuration and resource deployment in the specified region.
+// +// Prerequisites: +// - Valid Japan Tokyo zone configuration +// - Proper test suite initialization +// - Permissions to create resources in Japan Tokyo region +func TestRunInJPTokyoRegion(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Must load valid environment configuration") + + jpTokyoZone := utils.SplitAndTrim(envVars.JPTokZone, ",") + require.NotEmpty(t, jpTokyoZone, "Must provide valid Japan Tokyo zone configuration") + testLogger.DEBUG(t, fmt.Sprintf("Using Japan Tokyo zones: %v", jpTokyoZone)) + + // Test Configuration + options, err := setupOptions( + t, + clusterNamePrefix, // Generate Unique Cluster Prefix + terraformDir, + envVars.DefaultExistingResourceGroup, + ) + require.NoError(t, err, "Must initialize valid test options") + + // Region-Specific Configuration + options.TerraformVars["zones"] = jpTokyoZone + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfiguration(t, options, testLogger) + + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunCIDRsAsNonDefault validates that a cluster can be deployed using non-default +// VPC and subnet CIDR blocks, ensuring isolation and custom networking flexibility. 
+func TestRunCIDRsAsNonDefault(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Load Environment Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to load environment configuration") + + // Set Up Test Options + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Failed to initialize test options") + + // Override CIDR blocks with custom values + options.TerraformVars["vpc_cidr"] = "10.243.0.0/18" + options.TerraformVars["vpc_cluster_private_subnets_cidr_blocks"] = "10.243.0.0/20" + options.TerraformVars["vpc_cluster_login_private_subnets_cidr_blocks"] = "10.243.16.0/28" + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed — inspect validation logs for details", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunMultipleSSHKeys validates cluster creation with multiple SSH keys configured. +// Verifies proper handling and authentication with multiple SSH keys. 
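+// Post-deployment checks use lsf.ValidateClusterConfigurationWithMultipleKeys rather than the basic validation.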
+// +// Prerequisites: +// - Valid environment configuration +// - Proper test suite initialization +// - Multiple SSH keys configured in environment +func TestRunMultipleSSHKeys(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Test Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to load environment configuration") + + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + + require.NoError(t, err, "Failed to initialize test options") + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateClusterConfigurationWithMultipleKeys(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed — inspect validation logs for details", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// TestRunMultiProfileStaticAndDynamic validates cluster deployment with multiple static and dynamic +// compute instance profiles to ensure mixed provisioning works as expected. 
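+// Management nodes use bx2d-16x64 and bx2-2x8, static compute uses bx2d-4x16 and bx2-2x8,
+// and dynamic compute uses a single cx2-2x4 profile with a count of 10.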
+func TestRunMultiProfileStaticAndDynamic(t *testing.T) { + t.Parallel() + + // Initialization and Setup + setupTestSuite(t) + require.NotNil(t, testLogger, "Test logger must be initialized") + testLogger.Info(t, fmt.Sprintf("Test %s initiated", t.Name())) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Test Configuration + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to load environment configuration") + + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Failed to initialize test options") + + // Define multiple management instances + options.TerraformVars["management_instances"] = []map[string]interface{}{ + + { + "profile": "bx2d-16x64", + "count": 1, + "image": envVars.ManagementInstancesImage, + }, + { + "profile": "bx2-2x8", + "count": 1, + "image": envVars.ManagementInstancesImage, + }, + } + + // Define multiple static compute instances + options.TerraformVars["static_compute_instances"] = []map[string]interface{}{ + { + "profile": "bx2d-4x16", + "count": 1, + "image": envVars.StaticComputeInstancesImage, + }, + { + "profile": "bx2-2x8", + "count": 2, + "image": envVars.StaticComputeInstancesImage, + }, + } + + // Define multiple dynamic compute instances + options.TerraformVars["dynamic_compute_instances"] = []map[string]interface{}{ + { + "profile": "cx2-2x4", + "count": 10, + "image": envVars.DynamicComputeInstancesImage, + }, + } + + // Resource Cleanup Configuration + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + // Post-deployment Validation + validationStart := time.Now() + lsf.ValidateBasicClusterConfigurationForMultiProfileStaticAndDynamic(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Validation completed (duration: %v)", time.Since(validationStart))) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed — inspect validation logs for details", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// ******************* Existing VPC *************************** + +// TestRunCreateClusterWithExistingVPC as brand new +func TestRunCreateClusterWithExistingVPC(t *testing.T) { + // Parallelize the test to run concurrently with others + t.Parallel() + + // Set up the test suite and prepare the testing environment + setupTestSuite(t) + + testLogger.Info(t, "Brand new VPC creation initiated for "+t.Name()) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Get and validate environment variables + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + + // Set up the test options with the relevant parameters, including environment variables and resource 
group, set up test environment + options, err := setupOptionsVPC(t, clusterNamePrefix, createVpcTerraformDir, envVars.DefaultExistingResourceGroup) + require.NoError(t, err, "Error setting up test options: %v", err) + + // Skip test teardown for further inspection + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + output, err := options.RunTest() + require.NoError(t, err, "Error running consistency test: %v", err) + require.NotNil(t, output, "Expected non-nil output, but got nil") + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + outputs := (options.LastTestTerraformOutputs) + vpcName := outputs["vpc_name"].(string) + + bastionsubnetId, computesubnetIds := utils.GetSubnetIds(outputs) + + RunCreateClusterWithExistingVpcCIDRs(t, vpcName) + RunCreateClusterWithExistingVpcSubnetsNoDns(t, vpcName, bastionsubnetId, computesubnetIds) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// RunCreateClusterWithExistingVpcCIDRs with Cidr blocks +func RunCreateClusterWithExistingVpcCIDRs(t *testing.T, vpcName string) { + + // Set up the test suite and prepare the testing environment + setupTestSuite(t) + + // Log the initiation of the cluster creation process + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Static values for CIDR other than default CIDR + vpcClusterPrivateSubnetsCidrBlocks := "10.241.32.0/24" + vpcClusterLoginPrivateSubnetsCidrBlocks := "10.241.16.32/28" + + // Get and validate environment variables + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + + // Set up the test options with the relevant parameters, including environment variables and resource group + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + options.TerraformVars["vpc_name"] = vpcName + options.TerraformVars["vpc_cluster_private_subnets_cidr_blocks"] = vpcClusterPrivateSubnetsCidrBlocks + options.TerraformVars["vpc_cluster_login_private_subnets_cidr_blocks"] = vpcClusterLoginPrivateSubnetsCidrBlocks + require.NoError(t, err, "Error setting up test options: %v", err) + + // Skip test teardown for further inspection + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + lsf.ValidateClusterConfiguration(t, options, testLogger) + + testLogger.Info(t, fmt.Sprintf("Finished execution: %s", t.Name())) +} + +// RunCreateClusterWithExistingVpcSubnetsNoDns with compute and login subnet id. 
Both custom_resolver and dns_instance set to null
+func RunCreateClusterWithExistingVpcSubnetsNoDns(t *testing.T, vpcName string, bastionsubnetId string, computesubnetIds string) {
+
+ // Set up the test suite and prepare the testing environment
+ setupTestSuite(t)
+
+ // Log the initiation of the cluster creation process
+ testLogger.Info(t, "Cluster creation process initiated for "+t.Name())
+
+ // Generate Unique Cluster Prefix
+ clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())
+ testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix))
+
+ // Get and validate environment variables
+ envVars, err := GetEnvVars()
+ require.NoError(t, err, "Failed to get environment variables")
+
+ // Set up the test options with the relevant parameters, including environment variables and resource group
+ options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup)
+ options.TerraformVars["vpc_name"] = vpcName
+ options.TerraformVars["login_subnet_id"] = bastionsubnetId
+ options.TerraformVars["cluster_subnet_id"] = computesubnetIds
+ require.NoError(t, err, "Error setting up test options: %v", err)
+
+ // Skip test teardown for further inspection
+ options.SkipTestTearDown = true
+ defer options.TestTearDown()
+
+ // Cluster Deployment
+ deploymentStart := time.Now()
+ testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name()))
+
+ clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger)
+ require.NoError(t, clusterCreationErr, "Cluster creation validation failed")
+ testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart)))
+
+ lsf.ValidateClusterConfiguration(t, options, testLogger)
+ testLogger.Info(t, fmt.Sprintf("Finished execution: %s", t.Name()))
+}
+
+// TestRunCreateVpcWithCustomDns creates a brand new VPC with custom DNS enabled
+func TestRunCreateVpcWithCustomDns(t *testing.T) {
+ // Parallelize the test to run concurrently with others
+ t.Parallel()
+
+ // Set up the test suite and prepare the testing environment
+ setupTestSuite(t)
+
+ // Log the initiation of the cluster creation process
+ testLogger.Info(t, "Cluster creation process initiated for "+t.Name())
+
+ // Generate Unique Cluster Prefix
+ clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())
+ testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix))
+
+ // Get and validate environment variables
+ envVars, err := GetEnvVars()
+ require.NoError(t, err, "Failed to get environment variables")
+
+ // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment
+ options, err := setupOptionsVPC(t, clusterNamePrefix, createVpcTerraformDir, envVars.DefaultExistingResourceGroup)
+ options.TerraformVars["enable_hub"] = true
+ options.TerraformVars["dns_zone_name"] = "lsf.com"
+
+ require.NoError(t, err, "Error setting up test options: %v", err)
+
+ // Skip test teardown for further inspection
+ options.SkipTestTearDown = true
+ defer options.TestTearDown()
+
+ // Cluster Deployment
+ deploymentStart := time.Now()
+ testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name()))
+ output, err := options.RunTest()
+ require.NoError(t, err, "Error running consistency test: %v", err)
+ require.NotNil(t, output, "Expected non-nil output, but got nil")
+ testLogger.Info(t, fmt.Sprintf("Cluster deployment completed 
(duration: %v)", time.Since(deploymentStart))) + + outputs := (options.LastTestTerraformOutputs) + vpcName := outputs["vpc_name"].(string) + instanceId, customResolverId := utils.GetDnsCustomResolverIds(outputs) + bastionsubnetId, computesubnetIds := utils.GetSubnetIds(outputs) + + RunCreateClusterWithDnsAndResolver(t, vpcName, bastionsubnetId, computesubnetIds, instanceId, customResolverId) + RunCreateClusterWithOnlyResolver(t, vpcName, bastionsubnetId, computesubnetIds, customResolverId) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// RunCreateClusterWithDnsAndResolver with existing custom_resolver_id and dns_instance_id +func RunCreateClusterWithDnsAndResolver(t *testing.T, vpcName string, bastionsubnetId string, computesubnetIds string, instanceId string, customResolverId string) { + + // Set up the test suite and prepare the testing environment + setupTestSuite(t) + + // Log the initiation of the cluster creation process + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Get and validate environment variables + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + + // Set up the test options with the relevant parameters, including environment variables and resource group + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + options.TerraformVars["vpc_name"] = vpcName + options.TerraformVars["login_subnet_id"] = bastionsubnetId + options.TerraformVars["cluster_subnet_id"] = computesubnetIds + options.TerraformVars["dns_instance_id"] = instanceId + options.TerraformVars["dns_custom_resolver_id"] = customResolverId + + require.NoError(t, err, "Error setting up test options: %v", err) + + // Skip test teardown for further inspection + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + lsf.ValidateClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Finished execution: %s", t.Name())) + +} + +// RunCreateClusterWithOnlyResolver with existing custom_resolver_id and new dns_instance_id +func RunCreateClusterWithOnlyResolver(t *testing.T, vpcName string, bastionsubnetId string, computesubnetIds string, customResolverId string) { + + // Set up the test suite and prepare the testing environment + setupTestSuite(t) + + // Log the initiation of the cluster creation process + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Get and validate environment variables + envVars, err := 
GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + + // Set up the test options with the relevant parameters, including environment variables and resource group + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + options.TerraformVars["vpc_name"] = vpcName + options.TerraformVars["login_subnet_id"] = bastionsubnetId + options.TerraformVars["cluster_subnet_id"] = computesubnetIds + options.TerraformVars["dns_custom_resolver_id"] = customResolverId + + require.NoError(t, err, "Error setting up test options: %v", err) + + // Skip test teardown for further inspection + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + lsf.ValidateClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Finished execution: %s", t.Name())) +} + +// TestRunCreateVpcWithCustomDnsOnlyDNS creates a new VPC and uses custom DNS (DNS-only scenario) +func TestRunCreateVpcWithCustomDnsOnlyDNS(t *testing.T) { + // Parallelize the test to run concurrently with others + t.Parallel() + + // Set up the test suite and prepare the testing environment + setupTestSuite(t) + + // Log the initiation of the cluster creation process + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Get and validate environment variables + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + + // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment + options, err := setupOptionsVPC(t, clusterNamePrefix, createVpcTerraformDir, envVars.DefaultExistingResourceGroup) + options.TerraformVars["enable_hub"] = true + options.TerraformVars["dns_zone_name"] = "lsf.com" + + require.NoError(t, err, "Error setting up test options: %v", err) + + // Skip test teardown for further inspection + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + output, err := options.RunTest() + require.NoError(t, err, "Error running consistency test: %v", err) + require.NotNil(t, output, "Expected non-nil output, but got nil") + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + outputs := (options.LastTestTerraformOutputs) + instanceId, _ := utils.GetDnsCustomResolverIds(outputs) + + RunCreateClusterWithOnlyDns(t, instanceId) + + // Test Result Evaluation + if t.Failed() { + testLogger.Error(t, fmt.Sprintf("Test %s failed - inspect validation logs", t.Name())) + } else { + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) + } +} + +// RunCreateClusterWithOnlyDns creates a cluster using existing DNS instance (custom_resolver_id = null) +func 
RunCreateClusterWithOnlyDns(t *testing.T, instanceId string) { + + // Set up the test suite and prepare the testing environment + setupTestSuite(t) + + // Log the initiation of the cluster creation process + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Generate Unique Cluster Prefix + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + // Get and validate environment variables + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + + // Set up the test options with the relevant parameters, including environment variables and resource group + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + options.TerraformVars["dns_instance_id"] = instanceId + + require.NoError(t, err, "Error setting up test options: %v", err) + + // Skip test teardown for further inspection + options.SkipTestTearDown = true + defer options.TestTearDown() + + // Cluster Deployment + deploymentStart := time.Now() + testLogger.Info(t, fmt.Sprintf("Starting cluster deployment for test: %s", t.Name())) + + clusterCreationErr := lsf.VerifyClusterCreationAndConsistency(t, options, testLogger) + require.NoError(t, clusterCreationErr, "Cluster creation validation failed") + testLogger.Info(t, fmt.Sprintf("Cluster deployment completed (duration: %v)", time.Since(deploymentStart))) + + lsf.ValidateClusterConfiguration(t, options, testLogger) + testLogger.Info(t, fmt.Sprintf("Finished execution: %s", t.Name())) +} + +// ******************* Existing VPC *************************** diff --git a/tests/lsf_tests/lsf_negative_test.go b/tests/lsf_tests/lsf_negative_test.go new file mode 100644 index 00000000..5b57aa33 --- /dev/null +++ b/tests/lsf_tests/lsf_negative_test.go @@ -0,0 +1,1198 @@ +package tests + +import ( + "fmt" + "path/filepath" + "strings" + "testing" + + "github.com/gruntwork-io/terratest/modules/terraform" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + utils "github.com/terraform-ibm-modules/terraform-ibm-hpc/utilities" +) + +const ( + lsfSolutionPath = "solutions/lsf" + testPathPrefix = "tests/lsf_tests/" + invalidLDAPServerIP = "10.10.10.10" + invalidSubnetCIDR1 = "1.1.1.1/20" + invalidSubnetCIDR2 = "2.2.2.2/20" + invalidSubnetCIDR3 = "3.3.3.3/20" + invalidKMSKeyName = "sample-key" + invalidKMSInstance = "sample-ins" +) + +// getTerraformDirPath returns the absolute path to the LSF solution directory +func getTerraformDirPath(t *testing.T) string { + absPath, err := filepath.Abs(lsfSolutionPath) + require.NoError(t, err, "Failed to get absolute path for LSF solution") + return strings.ReplaceAll(absPath, testPathPrefix, "") +} + +// getBaseVars returns common variables for tests +func getBaseVars(t *testing.T) map[string]interface{} { + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + return map[string]interface{}{ + "cluster_prefix": utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()), + "ssh_keys": utils.SplitAndTrim(envVars.SSHKeys, ","), + "zones": utils.SplitAndTrim(envVars.Zones, ","), + "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), + "app_center_gui_password": APP_CENTER_GUI_PASSWORD, // pragma: allowlist secret + } +} + +// TestInvalidRunLSFWithoutMandatory tests Terraform's behavior when mandatory variables are missing +func 
TestInvalidRunLSFWithoutMandatory(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + terraformDirPath := getTerraformDirPath(t) + + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: map[string]interface{}{}, + }) + + UpgradeTerraformOnce(t, terraformOptions) + + _, err := terraform.PlanE(t, terraformOptions) + require.Error(t, err, "Expected an error during plan") + + validationPassed := utils.VerifyDataContains(t, err.Error(), "remote_allowed_ips", testLogger) + assert.True(t, validationPassed, "Should fail with missing mandatory variables") + testLogger.LogValidationResult(t, validationPassed, "Missing mandatory variables validation") +} + +// TestInvalidEmptyIbmcloudApiKey validates cluster creation with empty IBM Cloud API key +func TestInvalidEmptyIbmcloudApiKey(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["ibmcloud_api_key"] = "" // Empty API key //pragma: allowlist secret + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), "The API key for IBM Cloud must be set", testLogger) + assert.True(t, validationPassed, "Should fail with empty API key error") + testLogger.LogValidationResult(t, validationPassed, "Empty IBM Cloud API key validation") +} + +// TestInvalidLsfVersion validates cluster creation with invalid LSF version +func TestInvalidLsfVersion(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["lsf_version"] = "invalid_version" // Invalid LSF version + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), "Invalid LSF version. 
Allowed values are 'fixpack_14' and 'fixpack_15'", testLogger) + assert.True(t, validationPassed, "Should fail with invalid LSF version error") + testLogger.LogValidationResult(t, validationPassed, "Invalid LSF version validation") +} + +// TestInvalidAppCenterPassword validates cluster creation with invalid App Center password +func TestInvalidAppCenterPassword(t *testing.T) { + t.Parallel() + + invalidPasswords := []string{ + "weak", // Too short + "PasswoRD123", // Contains dictionary word // pragma: allowlist secret + "password123", // All lowercase // pragma: allowlist secret + "Password@", // Missing numbers // pragma: allowlist secret + "Password123", // Common password pattern // pragma: allowlist secret + "password@12345678901234567890", // Too long // pragma: allowlist secret + "ValidPass123\\", //Backslash not in allowed special chars // pragma: allowlist secret + "Pass word@1", //Contains space // pragma: allowlist secret + } + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + for _, password := range invalidPasswords { // pragma: allowlist secret + password := password // create local copy for parallel tests // pragma: allowlist secret + t.Run(password, func(t *testing.T) { // pragma: allowlist secret + t.Parallel() + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["app_center_gui_password"] = password // Invalid password // pragma: allowlist secret + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), "The password must be at least 8 characters long", testLogger) // pragma: allowlist secret + assert.True(t, validationPassed, "Should fail with invalid password error") // pragma: allowlist secret + testLogger.LogValidationResult(t, validationPassed, "Invalid App Center password validation") // pragma: allowlist secret + }) + } +} + +// TestInvalidMultipleZones validates cluster creation with multiple zones +func TestInvalidMultipleZones(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["zones"] = []string{"us-east-1", "us-east-2"} // Multiple zones + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), "HPC product deployment supports only a single zone", testLogger) + assert.True(t, validationPassed, "Should fail with multiple zones error") + testLogger.LogValidationResult(t, validationPassed, "Multiple zones validation") +} + +// TestInvalidClusterPrefix validates cluster creation with invalid cluster prefix +func TestInvalidClusterPrefix(t 
*testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["cluster_prefix"] = "--invalid-prefix--" // Invalid prefix + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), "Prefix must start with a lowercase letter", testLogger) + assert.True(t, validationPassed, "Should fail with invalid prefix error") + testLogger.LogValidationResult(t, validationPassed, "Invalid cluster prefix validation") +} + +// TestInvalidResourceGroup validates cluster creation with null resource group +func TestInvalidResourceGroup(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["existing_resource_group"] = "Invalid" // Invalid resource group + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), "Given Resource Group is not found in the account", testLogger) + assert.True(t, validationPassed, "Should fail with invalid resource group error") + testLogger.LogValidationResult(t, validationPassed, "Invalid resource group validation") +} + +// TestInvalidLoginSubnet validates cluster creation with invalid subnet combination +func TestInvalidLoginSubnet(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["login_subnet_id"] = "subnet-123" // Only providing login subnet + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), "In case of existing subnets, provide both login_subnet_id and", testLogger) + assert.True(t, validationPassed, "Should fail with invalid subnet combination error") + testLogger.LogValidationResult(t, validationPassed, "Invalid subnet combination validation") +} + +// TestInvalidDynamicComputeInstances validates cluster creation with multiple dynamic compute instances +func TestInvalidDynamicComputeInstances(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation 
process initiated for "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["dynamic_compute_instances"] = []map[string]interface{}{ + { + "profile": "bx2-4x16", + "count": 1024, + "image": "hpc-lsf-fp15-compute-rhel810-v1", + }, + { + "profile": "cx2-4x8", + "count": 1024, + "image": "hpc-lsf-fp15-compute-rhel810-v1", + }, + } + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), "Only a single map (one instance profile) is allowed for dynamic compute", testLogger) + assert.True(t, validationPassed, "Should fail with multiple dynamic compute instances error") + testLogger.LogValidationResult(t, validationPassed, "Multiple dynamic compute instances validation") +} + +// TestInvalidKmsKeyName validates cluster creation with KMS key name but no instance name +func TestInvalidKmsKeyName(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["key_management"] = "key_protect" + terraformVars["kms_key_name"] = "my-key" // Key name without instance name + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), "Please make sure you are passing the kms_instance_name", testLogger) + assert.True(t, validationPassed, "Should fail with missing KMS instance name error") + testLogger.LogValidationResult(t, validationPassed, "KMS key name without instance name validation") +} + +// TestInvalidSshKeyFormat validates cluster creation with invalid SSH key format +func TestInvalidSshKeyFormat(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Test: "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["ssh_keys"] = []string{"invalid-key-with spaces"} // Invalid format + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + assert.True(t, + strings.Contains(err.Error(), "Invalid SSH key") || + strings.Contains(err.Error(), "No SSH Key found"), + "Error should be about SSH key validation. 
Got: %s", err.Error(), + ) + testLogger.LogValidationResult(t, true, "Invalid SSH key format validation") +} + +// TestInvalidZoneRegionCombination validates invalid zone/region combination +func TestInvalidZoneRegionCombination(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Test: "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["zones"] = []string{"eu-tok-1"} // Invalid for US region + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + assert.True(t, + strings.Contains(err.Error(), "dial tcp: lookup eu-tok.iaas.cloud.ibm.com: no such host") || + strings.Contains(err.Error(), "invalid zone"), + "Error should be about zone/region mismatch. Got: %s", err.Error(), + ) + testLogger.LogValidationResult(t, true, "Invalid zone/region validation") +} + +// TestExceedManagementNodeLimit validates exceeding management node limit +func TestExceedManagementNodeLimit(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Test: "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["management_instances"] = []map[string]interface{}{ + { + "count": 11, // Exceeds limit of 10 + "profile": "bx2-16x64", + "image": "hpc-lsf-fp15-rhel810-v1", + }, + } + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + assert.True(t, + strings.Contains(err.Error(), "must not exceed") || + strings.Contains(err.Error(), "limit of 10"), + "Error should be about management node limit. Got: %s", err.Error(), + ) + testLogger.LogValidationResult(t, true, "Management node limit validation") +} + +// TestInvalidFileShareConfiguration validates invalid file share config +func TestInvalidFileShareConfiguration(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Test: "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["custom_file_shares"] = []map[string]interface{}{ + { + "mount_path": "/mnt/vpcstorage/tools", + "size": 5, // Below minimum 10GB + "iops": 2000, + }, + } + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + assert.True(t, + strings.Contains(err.Error(), "must be greater than or equal to 10"), + "Error should be about file share size. 
Got: %s", err.Error(), + ) + testLogger.LogValidationResult(t, true, "File share size validation") +} + +// TestInvalidDnsDomainName validates invalid DNS domain name +func TestInvalidDnsDomainName(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Test: "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["dns_domain_name"] = map[string]interface{}{ + "compute": "invalid_domain", // Missing .com + } + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + assert.True(t, + strings.Contains(err.Error(), "must be a valid FQDN"), + "Error should be about DNS domain format. Got: %s", err.Error(), + ) + testLogger.LogValidationResult(t, true, "DNS domain validation") +} + +// TestInvalidLdapServerIP validates cluster creation with invalid LDAP server IP +func TestInvalidLdapServerIP(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Retrieve environment variables + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + if strings.ToLower(envVars.EnableLdap) != "true" { + t.Skip("LDAP is not enabled. Set the 'enable_ldap' environment variable to 'true' to run this test.") + } + + // Validate required LDAP credentials + if len(envVars.LdapAdminPassword) == 0 || len(envVars.LdapUserName) == 0 || len(envVars.LdapUserPassword) == 0 { // pragma: allowlist secret + t.Fatal("LDAP credentials are missing. 
Make sure LDAP admin password, LDAP user name, and LDAP user password are provided.") + } + + // Get base Terraform variables + + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + + // Set invalid LDAP server configuration + terraformVars["enable_ldap"] = true + terraformVars["ldap_server"] = "10.10.10.10" // Invalid IP + terraformVars["ldap_server_cert"] = "SampleTest" + terraformVars["ldap_admin_password"] = envVars.LdapAdminPassword // pragma: allowlist secret + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + output, err := terraform.InitAndApplyE(t, terraformOptions) + + require.Error(t, err, "Expected an error during apply") + validationPassed := utils.VerifyDataContains(t, output, "Failed to connect to LDAP server at 10.10.10.10", testLogger) + assert.True(t, validationPassed, "Should fail with invalid LDAP server IP error") + testLogger.LogValidationResult(t, validationPassed, "Invalid LDAP server IP validation") + + defer terraform.Destroy(t, terraformOptions) +} + +// TestInvalidLdapServerCert validates cluster creation with invalid LDAP server certificate +func TestInvalidLdapServerCert(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Retrieve environment variables + envVars, err := GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + + if strings.ToLower(envVars.EnableLdap) != "true" { + t.Skip("LDAP is not enabled. Set the 'enable_ldap' environment variable to 'true' to run this test.") + } + + // Validate required LDAP credentials + if len(envVars.LdapAdminPassword) == 0 || len(envVars.LdapUserName) == 0 || len(envVars.LdapUserPassword) == 0 { // pragma: allowlist secret + t.Fatal("LDAP credentials are missing. Make sure LDAP admin password, LDAP user name, and LDAP user password are provided.") + } + + // Get base Terraform variables + + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + + // Set invalid LDAP server certificate configuration + terraformVars["enable_ldap"] = true + terraformVars["ldap_server"] = "10.10.10.10" // Existing server + terraformVars["ldap_server_cert"] = "" // Missing certificate + terraformVars["ldap_admin_password"] = envVars.LdapAdminPassword // pragma: allowlist secret + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err = terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, err.Error(), + "Provide the current LDAP server certificate. 
This is required if", + testLogger) && utils.VerifyDataContains(t, err.Error(), + "'ldap_server' is set; otherwise, the LDAP configuration will not succeed.", + testLogger) + + assert.True(t, validationPassed, "Should fail with missing LDAP server certificate error") + testLogger.LogValidationResult(t, validationPassed, "Invalid LDAP server certificate validation") + +} + +// TestInvalidLdapConfigurations validates various invalid LDAP configurations +func TestInvalidLdapConfigurations(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + config map[string]interface{} + expectedErrors []string + description string + }{ + // Username validation tests + { + name: "UsernameWithSpace", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "invalid user", + "ldap_user_password": "ValidPass123!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "LDAP username must be between 4-32 characters", + "can only contain letters, numbers, hyphens, and underscores", + "Spaces are not permitted.", + }, + description: "Username containing space should fail", + }, + { + name: "UsernameTooShort", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "usr", + "ldap_user_password": "ValidPass123!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "LDAP username must be between 4-32 characters long and can only contain", + "letters, numbers, hyphens, and underscores", + }, + description: "Username shorter than 4 characters should fail", + }, + { + name: "UsernameTooLong", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "thisusernameiswaytoolongandshouldfailvalidation", + "ldap_user_password": "ValidPass123!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "LDAP username must be between 4-32 characters long and can only contain", + "letters, numbers, hyphens, and underscores", + }, + description: "Username longer than 32 characters should fail", + }, + { + name: "UsernameWithSpecialChars", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "user@name#", + "ldap_user_password": "ValidPass123!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "LDAP username must be between 4-32 characters long and can only contain", + "letters, numbers, hyphens, and underscores. 
Spaces are not permitted.", + }, + description: "Username with special characters should fail", + }, + + // Password validation tests + { + name: "PasswordTooShort", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "validuser", + "ldap_user_password": "Short1!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "must be 8 to 20 characters long", + }, + description: "Password shorter than 8 characters should fail", + }, + { + name: "PasswordTooLong", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "validuser", + "ldap_user_password": "ThisPasswordIsWayTooLong123!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "must be 8 to 20 characters long", + }, + description: "Password longer than 20 characters should fail", + }, + { + name: "PasswordMissingUppercase", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "validuser", + "ldap_user_password": "missingupper1!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "two alphabetic characters (with one uppercase and one lowercase)", + }, + description: "Password missing uppercase letter should fail", + }, + { + name: "PasswordMissingLowercase", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "validuser", + "ldap_user_password": "MISSINGLOWER1!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "two alphabetic characters (with one uppercase and one lowercase)", + }, + description: "Password missing lowercase letter should fail", + }, + { + name: "PasswordMissingNumber", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "validuser", + "ldap_user_password": "MissingNumber!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "two alphabetic characters (with one uppercase and one lowercase), one", + "number, and one special character", + }, + description: "Password missing number should fail", + }, + { + name: "PasswordMissingSpecialChar", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "validuser", + "ldap_user_password": "MissingSpecial1", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "one special character", + }, + description: "Password missing special character should fail", + }, + { + name: "PasswordWithSpace", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "validuser", + "ldap_user_password": "Invalid Pass123!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "password must not contain the username or any spaces", + }, + description: "Password containing space should fail", + }, + { + name: "PasswordContainsUsername", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "Validuser", + "ldap_user_password": "Validuser123!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "password must not contain the username or any spaces", + }, + description: "Password 
containing username should fail", + }, + + // Admin password validation tests + { + name: "AdminPasswordMissing", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "validuser", + "ldap_user_password": "ValidPass123!", // pragma: allowlist secret + "ldap_admin_password": "", // pragma: allowlist secret + }, + expectedErrors: []string{ + "The LDAP admin password must be 8 to 20 characters long and include", + "least two alphabetic characters (with one uppercase and one lowercase)", + }, + description: "Missing admin password should fail", + }, + { + name: "AdminPasswordTooShort", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_user_name": "validuser", + "ldap_user_password": "ValidPass123!", // pragma: allowlist secret + "ldap_admin_password": "Short1!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "must be 8 to 20 characters long", + }, + description: "Admin password too short should fail", + }, + + // Base DNS validation + { + name: "MissingBaseDNS", + config: map[string]interface{}{ + "enable_ldap": true, + "ldap_basedns": "", + "ldap_user_name": "validuser", + "ldap_user_password": "ValidPass123!", // pragma: allowlist secret + "ldap_admin_password": "AdminPass123!", // pragma: allowlist secret + }, + expectedErrors: []string{ + "If LDAP is enabled, then the base DNS should not be empty or null.", + }, + description: "Missing base DNS should fail", + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Test: "+t.Name()) + + // Get base vars and merge with test case config + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + for k, v := range tc.config { + terraformVars[k] = v + } + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + _, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan for: "+tc.description) + + // Check if any of the expected error messages are present + var found bool + errorMsg := err.Error() + for _, expectedErr := range tc.expectedErrors { + if strings.Contains(errorMsg, expectedErr) { + found = true + break + } + } + + assert.True(t, found, + "Expected error containing one of: %v\nBut got: %s", + tc.expectedErrors, errorMsg) + + testLogger.LogValidationResult(t, found, tc.description) + }) + } +} + +// TestInvalidDeployerImage validates invalid deployer image configurations +func TestInvalidDeployerImage(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Starting negative tests for deployer image validation") + + invalidCases := []struct { + name string + lsfVersion string + deployerImage string + expectedError string + }{ + { + name: "FP14_LSF_with_FP15_Image", + lsfVersion: "fixpack_14", + deployerImage: "hpc-lsf-fp15-deployer-rhel810-v1", + expectedError: "Mismatch between deployer_instance.image and lsf_version", + }, + { + name: "FP15_LSF_with_FP14_Image", + lsfVersion: "fixpack_15", + deployerImage: "hpc-lsf-fp14-deployer-rhel810-v1", + expectedError: "Mismatch between deployer_instance.image and lsf_version", + }, + { + name: "Malformed_Image_Name", + lsfVersion: "fixpack_15", + deployerImage: "custom-fp15-image", + expectedError: "Invalid deployer image. 
Allowed values", + }, + { + name: "Empty_Image_Name", + lsfVersion: "fixpack_15", + deployerImage: "", + expectedError: "Invalid deployer image", + }, + { + name: "Unsupported_FP13_Deployer", + lsfVersion: "fixpack_13", + deployerImage: "hpc-lsf-fp13-deployer-rhel810-v1", + expectedError: "Invalid LSF version. Allowed values are 'fixpack_14' and 'fixpack_15'", + }, + } + + for _, tc := range invalidCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["lsf_version"] = tc.lsfVersion + terraformVars["deployer_instance"] = map[string]interface{}{ + "image": tc.deployerImage, + "profile": "bx2-8x32", + } + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + _, err := terraform.PlanE(t, terraformOptions) + require.Error(t, err, "Expected '%s' to fail but it passed", tc.name) + assert.Contains(t, err.Error(), tc.expectedError, + "Expected error message mismatch for case: %s", tc.name) + testLogger.Info(t, fmt.Sprintf("Correctly blocked invalid case: %s", tc.name)) + }) + } +} + +// TestInvalidSshKeys validates cluster creation with invalid ssh keys +func TestInvalidSshKeys(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Retrieve environment variables + _, err := GetEnvVars() + require.NoError(t, err, "Failed to get environment variables") + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + + // Test cases for invalid SSH keys + testCases := []struct { + name string + key string + errorPhrase string + }{ + { + name: "Empty SSH key", + key: "", + errorPhrase: "No SSH Key found with name", + }, + { + name: "Invalid key format", + key: "invalid@key", + errorPhrase: "No SSH Key found with name", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + terraformVars["ssh_keys"] = []string{tc.key} + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + UpgradeTerraformOnce(t, terraformOptions) + output, err := terraform.PlanE(t, terraformOptions) + + require.Error(t, err, "Expected an error during plan") + validationPassed := utils.VerifyDataContains(t, output, + tc.errorPhrase, + testLogger) + + assert.True(t, validationPassed, fmt.Sprintf("Should fail with %s", tc.name)) + testLogger.LogValidationResult(t, validationPassed, fmt.Sprintf("%s validation", tc.name)) + }) + } +} + +// TestInvalidRemoteAllowedIP validates cluster creation with invalid remote allowed IP +func TestInvalidRemoteAllowedIP(t *testing.T) { + t.Parallel() + + setupTestSuite(t) + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Get base Terraform variables + terraformVars := getBaseVars(t) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"])) + terraformVars["remote_allowed_ips"] = []string{""} + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + 
Vars:         terraformVars,
+	})
+
+	UpgradeTerraformOnce(t, terraformOptions)
+	output, err := terraform.PlanE(t, terraformOptions)
+
+	require.Error(t, err, "Expected an error during plan")
+	validationPassed := utils.VerifyDataContains(t, output,
+		"The provided IP address format is not valid",
+		testLogger)
+
+	assert.True(t, validationPassed, "Should fail with an invalid remote allowed IP error")
+	testLogger.LogValidationResult(t, validationPassed, "Invalid remote allowed IP validation")
+}
+
+// TestInvalidInstanceProfiles validates invalid instance profile configurations
+func TestInvalidInstanceProfiles(t *testing.T) {
+	t.Parallel()
+
+	setupTestSuite(t)
+	testLogger.Info(t, "Starting negative tests for instance profile validation")
+
+	invalidCases := []struct {
+		name            string
+		bastionProfile  string
+		deployerProfile string
+		loginProfile    string
+		mgmtProfile     string
+		staticProfile   string
+		dynamicProfile  string
+		expectedError   string
+	}{
+		{
+			name:            "Invalid_Bastion_Profile_Format",
+			bastionProfile:  "cx2-invalid",
+			deployerProfile: "bx2-8x32",
+			loginProfile:    "bx2-2x8",
+			mgmtProfile:     "bx2-16x64",
+			staticProfile:   "bx2-4x16",
+			dynamicProfile:  "bx2-4x16",
+			expectedError:   "The profile must be a valid virtual server instance profile",
+		},
+		{
+			name:            "Invalid_Deployer_Profile_Format",
+			bastionProfile:  "cx2-4x8",
+			deployerProfile: "bx2-invalid",
+			loginProfile:    "bx2-2x8",
+			mgmtProfile:     "bx2-16x64",
+			staticProfile:   "bx2-4x16",
+			dynamicProfile:  "bx2-4x16",
+			expectedError:   "The profile must be a valid virtual server instance profile",
+		},
+		{
+			name:            "Invalid_Login_Profile_Format",
+			bastionProfile:  "cx2-4x8",
+			deployerProfile: "bx2-8x32",
+			loginProfile:    "invalid-login-profile",
+			mgmtProfile:     "bx2-16x64",
+			staticProfile:   "bx2-4x16",
+			dynamicProfile:  "bx2-4x16",
+			expectedError:   "The profile must be a valid virtual server instance profile",
+		},
+		{
+			name:            "Invalid_Management_Profile_Format",
+			bastionProfile:  "cx2-4x8",
+			deployerProfile: "bx2-8x32",
+			loginProfile:    "bx2-2x8",
+			mgmtProfile:     "mgmt-invalid-format",
+			staticProfile:   "bx2-4x16",
+			dynamicProfile:  "bx2-4x16",
+			expectedError:   "The profile must be a valid virtual server instance profile",
+		},
+		{
+			name:            "Invalid_Static_Compute_Profile_Format",
+			bastionProfile:  "cx2-4x8",
+			deployerProfile: "bx2-8x32",
+			loginProfile:    "bx2-2x8",
+			mgmtProfile:     "bx2-16x64",
+			staticProfile:   "static-invalid",
+			dynamicProfile:  "bx2-4x16",
+			expectedError:   "The profile must be a valid virtual server instance profile",
+		},
+		{
+			name:            "Invalid_Dynamic_Compute_Profile_Format",
+			bastionProfile:  "cx2-4x8",
+			deployerProfile: "bx2-8x32",
+			loginProfile:    "bx2-2x8",
+			mgmtProfile:     "bx2-16x64",
+			staticProfile:   "bx2-4x16",
+			dynamicProfile:  "dynamic-invalid",
+			expectedError:   "The profile must be a valid virtual server instance profile",
+		},
+		{
+			name:            "Multiple_Dynamic_Compute_Profiles",
+			bastionProfile:  "cx2-4x8",
+			deployerProfile: "bx2-8x32",
+			loginProfile:    "bx2-2x8",
+			mgmtProfile:     "bx2-16x64",
+			staticProfile:   "bx2-4x16",
+			dynamicProfile:  "bx2-4x16",
+			expectedError:   "Only a single map (one instance profile) is allowed for dynamic compute",
+		},
+	}
+
+	for _, tc := range invalidCases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			// Get base Terraform variables
+			terraformVars := getBaseVars(t)
+			testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", terraformVars["cluster_prefix"]))
+
+			// Set instance profiles
+			terraformVars["bastion_instance"] = map[string]interface{}{
+				"image":
"ibm-ubuntu-22-04-5-minimal-amd64-3", + "profile": tc.bastionProfile, + } + terraformVars["deployer_instance"] = map[string]interface{}{ + "image": "hpc-lsf-fp15-deployer-rhel810-v1", + "profile": tc.deployerProfile, + } + terraformVars["login_instance"] = []map[string]interface{}{ + { + "image": "hpc-lsf-fp15-compute-rhel810-v1", + "profile": tc.loginProfile, + }, + } + terraformVars["management_instances"] = []map[string]interface{}{ + { + "image": "hpc-lsf-fp15-rhel810-v1", + "profile": tc.mgmtProfile, + "count": 2, + }, + } + terraformVars["static_compute_instances"] = []map[string]interface{}{ + { + "image": "hpc-lsf-fp15-compute-rhel810-v1", + "profile": tc.staticProfile, + "count": 1, + }, + } + + // Special case for multiple dynamic compute profiles + if tc.name == "Multiple_Dynamic_Compute_Profiles" { + terraformVars["dynamic_compute_instances"] = []map[string]interface{}{ + { + "image": "hpc-lsf-fp15-compute-rhel810-v1", + "profile": "bx2-4x16", + "count": 512, + }, + { + "image": "hpc-lsf-fp15-compute-rhel810-v1", + "profile": "bx2-8x32", + "count": 512, + }, + } + } else { + terraformVars["dynamic_compute_instances"] = []map[string]interface{}{ + { + "image": "hpc-lsf-fp15-compute-rhel810-v1", + "profile": tc.dynamicProfile, + "count": 1024, + }, + } + } + + terraformDirPath := getTerraformDirPath(t) + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + TerraformDir: terraformDirPath, + Vars: terraformVars, + }) + + _, err := terraform.PlanE(t, terraformOptions) + require.Error(t, err, "Expected '%s' to fail but it passed", tc.name) + assert.Contains(t, err.Error(), tc.expectedError, + "Expected error message mismatch for case: %s", tc.name) + testLogger.Info(t, fmt.Sprintf("Correctly blocked invalid case: %s", tc.name)) + }) + } +} diff --git a/tests/lsf_tests/lsf_setup.go b/tests/lsf_tests/lsf_setup.go new file mode 100644 index 00000000..62c2022d --- /dev/null +++ b/tests/lsf_tests/lsf_setup.go @@ -0,0 +1,423 @@ +package tests + +import ( + "fmt" + "log" + "os" + "reflect" + "strings" + "sync" + "testing" + "time" + + "github.com/gruntwork-io/terratest/modules/terraform" + "github.com/stretchr/testify/require" + "github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper/testhelper" + + utils "github.com/terraform-ibm-modules/terraform-ibm-hpc/utilities" +) + +// Constants for configuration +const ( + // Terraform solution directory + terraformDir = "solutions/lsf" + + // Default scheduler + Solution = "lsf" + + // Configuration files for each LSF version + lsfFP14ConfigFile = "lsf_fp14_config.yml" + lsfFP15ConfigFile = "lsf_fp15_config.yml" + defaultConfigFile = lsfFP15ConfigFile // Use latest as default + + // Log file suffixes + defaultLogFileSuffix = ".log" + defaultJSONLogFileSuffix = ".json" +) + +// Constants for LSF version normalization +const ( + DefaultLSFVersion = "fixpack_15" + LSF14 = "fixpack_14" + LSF15 = "fixpack_15" +) + +// EnvVars represents all environment variables required for the test +// Fields with `required:"true"` tag must be set for tests to run +type EnvVars struct { + Scheduler string + DefaultExistingResourceGroup string + NonDefaultExistingResourceGroup string + Zones string `required:"true"` + ClusterName string `required:"true"` + RemoteAllowedIPs string `required:"true"` + SSHKeys string `required:"true"` + LoginNodeInstanceType string + LoginNodeImageName string + ManagementInstances string + DeployerInstance string + EnableVPCFlowLogs string + KeyManagement string + KMSInstanceName string + KMSKeyName 
string + EnableHyperthreading string + DnsDomainName string + EnableAppCenter string + AppCenterGuiPassword string + EnableLdap string + LdapBaseDns string + LdapServer string + LdapAdminPassword string + LdapUserName string + LdapUserPassword string + LdapInstance string + USEastZone string + USEastClusterName string + USEastReservationID string + JPTokZone string + JPTokClusterName string + JPTokReservationID string + EUDEZone string + EUDEClusterName string + EUDEReservationID string + USSouthZone string + USSouthClusterName string + USSouthReservationID string + SSHFilePath string + SSHFilePathTwo string + WorkerNodeMaxCount string + StaticComputeInstances string + DynamicComputeInstances string + SccWPEnabled string + CspmEnabled string + SccwpServicePlan string + AppConfigPlan string + ObservabilityMonitoringEnable string + ObservabilityMonitoringOnComputeNodesEnable string + ObservabilityAtrackerEnable string + ObservabilityAtrackerTargetType string + ObservabilityLogsEnableForManagement string + ObservabilityLogsEnableForCompute string + ObservabilityEnablePlatformLogs string + ObservabilityEnableMetricsRouting string + ObservabilityLogsRetentionPeriod string + ObservabilityMonitoringPlan string + EnableCosIntegration string + CustomFileShares string + BastionInstance string + ManagementInstancesImage string + StaticComputeInstancesImage string + DynamicComputeInstancesImage string + LsfVersion string + LoginInstance string + AttrackerTestZone string +} + +func GetEnvVars() (*EnvVars, error) { + vars := &EnvVars{ + Scheduler: os.Getenv("SCHEDULER"), + DefaultExistingResourceGroup: os.Getenv("DEFAULT_EXISTING_RESOURCE_GROUP"), + NonDefaultExistingResourceGroup: os.Getenv("NON_DEFAULT_EXISTING_RESOURCE_GROUP"), + Zones: os.Getenv("ZONES"), + ClusterName: os.Getenv("CLUSTER_NAME"), + RemoteAllowedIPs: os.Getenv("REMOTE_ALLOWED_IPS"), + SSHKeys: os.Getenv("SSH_KEYS"), + ManagementInstances: os.Getenv("MANAGEMENT_INSTANCES"), + DeployerInstance: os.Getenv("DEPLOYER_INSTANCE"), + BastionInstance: os.Getenv("BASTION_INSTANCE"), + EnableVPCFlowLogs: os.Getenv("ENABLE_VPC_FLOW_LOGS"), + KeyManagement: os.Getenv("KEY_MANAGEMENT"), + KMSInstanceName: os.Getenv("KMS_INSTANCE_NAME"), + KMSKeyName: os.Getenv("KMS_KEY_NAME"), + EnableHyperthreading: os.Getenv("ENABLE_HYPERTHREADING"), + DnsDomainName: os.Getenv("DNS_DOMAIN_NAME"), + AppCenterGuiPassword: os.Getenv("APP_CENTER_GUI_PASSWORD"), + EnableLdap: os.Getenv("ENABLE_LDAP"), + LdapBaseDns: os.Getenv("LDAP_BASEDNS"), + LdapServer: os.Getenv("LDAP_SERVER"), + LdapAdminPassword: os.Getenv("LDAP_ADMIN_PASSWORD"), + LdapUserName: os.Getenv("LDAP_USER_NAME"), + LdapUserPassword: os.Getenv("LDAP_USER_PASSWORD"), + LdapInstance: os.Getenv("LDAP_INSTANCE"), + USEastZone: os.Getenv("US_EAST_ZONE"), + USEastClusterName: os.Getenv("US_EAST_CLUSTER_NAME"), + USEastReservationID: os.Getenv("US_EAST_RESERVATION_ID"), + JPTokZone: os.Getenv("JP_TOK_ZONE"), + JPTokReservationID: os.Getenv("JP_TOK_RESERVATION_ID"), + JPTokClusterName: os.Getenv("JP_TOK_CLUSTER_NAME"), + EUDEZone: os.Getenv("EU_DE_ZONE"), + EUDEClusterName: os.Getenv("EU_DE_CLUSTER_NAME"), + EUDEReservationID: os.Getenv("EU_DE_RESERVATION_ID"), + USSouthZone: os.Getenv("US_SOUTH_ZONE"), + USSouthReservationID: os.Getenv("US_SOUTH_RESERVATION_ID"), + USSouthClusterName: os.Getenv("US_SOUTH_CLUSTER_NAME"), + SSHFilePath: os.Getenv("SSH_FILE_PATH"), + SSHFilePathTwo: os.Getenv("SSH_FILE_PATH_TWO"), + WorkerNodeMaxCount: os.Getenv("WORKER_NODE_MAX_COUNT"), + StaticComputeInstances: 
os.Getenv("STATIC_COMPUTE_INSTANCES"), + DynamicComputeInstances: os.Getenv("DYNAMIC_COMPUTE_INSTANCES"), + SccWPEnabled: os.Getenv("SCCWP_ENABLED"), + CspmEnabled: os.Getenv("CSPM_ENABLED"), + SccwpServicePlan: os.Getenv("SCCWP_SERVICE_PLAN"), + AppConfigPlan: os.Getenv("APP_CONFIG_PLAN"), + ObservabilityMonitoringEnable: os.Getenv("OBSERVABILITY_MONITORING_ENABLE"), + ObservabilityMonitoringOnComputeNodesEnable: os.Getenv("OBSERVABILITY_MONITORING_ON_COMPUTE_NODES_ENABLE"), + ObservabilityAtrackerEnable: os.Getenv("OBSERVABILITY_ATRACKER_ENABLE"), + ObservabilityAtrackerTargetType: os.Getenv("OBSERVABILITY_ATRACKER_TARGET_TYPE"), + ObservabilityLogsEnableForManagement: os.Getenv("OBSERVABILITY_LOGS_ENABLE_FOR_MANAGEMENT"), + ObservabilityLogsEnableForCompute: os.Getenv("OBSERVABILITY_LOGS_ENABLE_FOR_COMPUTE"), + ObservabilityEnablePlatformLogs: os.Getenv("OBSERVABILITY_ENABLE_PLATFORM_LOGS"), + ObservabilityEnableMetricsRouting: os.Getenv("OBSERVABILITY_ENABLE_METRICS_ROUTING"), + ObservabilityLogsRetentionPeriod: os.Getenv("OBSERVABILITY_LOGS_RETENTION_PERIOD"), + ObservabilityMonitoringPlan: os.Getenv("OBSERVABILITY_MONITORING_PLAN"), + EnableCosIntegration: os.Getenv("ENABLE_COS_INTEGRATION"), + CustomFileShares: os.Getenv("CUSTOM_FILE_SHARES"), + ManagementInstancesImage: os.Getenv("MANAGEMENT_INSTANCES_IMAGE"), + StaticComputeInstancesImage: os.Getenv("STATIC_COMPUTE_INSTANCES_IMAGE"), + DynamicComputeInstancesImage: os.Getenv("DYNAMIC_COMPUTE_INSTANCES_IMAGE"), + LsfVersion: os.Getenv("LSF_VERSION"), + LoginInstance: os.Getenv("LOGIN_INSTANCE"), + AttrackerTestZone: os.Getenv("ATTRACKER_TEST_ZONE"), + } + + // Validate required fields + v := reflect.ValueOf(vars).Elem() + t := v.Type() + for i := 0; i < v.NumField(); i++ { + field := t.Field(i) + if tag, ok := field.Tag.Lookup("required"); ok && tag == "true" { + fieldValue := v.Field(i).String() + if fieldValue == "" { + return nil, fmt.Errorf("missing required environment variable: %s", field.Name) + } + } + } + + return vars, nil +} + +var ( + // testLogger stores the logger instance for logging test messages. + testLogger *utils.AggregatedLogger + + // once ensures that the test suite initialization logic (e.g., logger setup) runs only once, + // even when called concurrently by multiple test functions. 
+	once sync.Once
+)
+
+// setupTestSuite initializes the shared aggregated logger once per test run,
+// deriving the log file name from LOG_FILE_NAME when that variable is set.
+func setupTestSuite(t *testing.T) {
+	once.Do(func() {
+		timestamp := time.Now().Format("2006-01-02_15-04-05")
+		var logFileName string
+
+		if validationLogFilePrefix, ok := os.LookupEnv("LOG_FILE_NAME"); ok {
+			fileName := strings.Split(validationLogFilePrefix, defaultJSONLogFileSuffix)[0]
+			logFileName = fmt.Sprintf("%s%s", fileName, defaultLogFileSuffix)
+		} else {
+			logFileName = fmt.Sprintf("%s%s", timestamp, defaultLogFileSuffix)
+		}
+
+		_ = os.Setenv("LOG_FILE_NAME", fmt.Sprintf("%s%s", strings.Split(logFileName, ".")[0], defaultJSONLogFileSuffix))
+
+		var err error
+		testLogger, err = utils.NewAggregatedLogger(logFileName)
+		if err != nil {
+			t.Fatalf("Error initializing logger: %v", err)
+		}
+		testLogger.Info(t, "Logger initialized successfully")
+	})
+}
+
+var upgradeOnce sync.Once // Ensures upgrade is performed only once
+
+// UpgradeTerraformOnce runs `terraform init -upgrade=true` once per test run,
+// failing the calling test if the upgrade does not succeed.
+func UpgradeTerraformOnce(t *testing.T, terraformOptions *terraform.Options) {
+	upgradeOnce.Do(func() {
+		testLogger.Info(t, "Running Terraform upgrade with `-upgrade=true`...")
+
+		// Run terraform upgrade command
+		output, err := terraform.RunTerraformCommandE(t, terraformOptions, "init", "-upgrade=true")
+		if err != nil {
+			// Log the Terraform upgrade output in case of any failures
+			testLogger.FAIL(t, fmt.Sprintf("Terraform upgrade failed: %v", err))
+			testLogger.FAIL(t, fmt.Sprintf("Terraform upgrade output:\n%s", output))
+			require.NoError(t, err, "Terraform upgrade failed")
+		}
+		testLogger.PASS(t, "Terraform upgrade completed successfully")
+	})
+}
+
+// checkRequiredEnvVars verifies that required environment variables are set.
+// Returns an error if any required env var is missing.
+func checkRequiredEnvVars() error {
+	required := []string{"TF_VAR_ibmcloud_api_key", "ZONES", "REMOTE_ALLOWED_IPS", "SSH_KEYS"}
+
+	for _, envVar := range required {
+		if os.Getenv(envVar) == "" {
+			return fmt.Errorf("environment variable %s is not set", envVar)
+		}
+	}
+	return nil
+}
+
+// setupOptionsVPC creates a test options object with the given parameters for provisioning a brand new VPC.
+func setupOptionsVPC(t *testing.T, clusterNamePrefix, terraformDir, existingResourceGroup string) (*testhelper.TestOptions, error) {
+
+	if err := checkRequiredEnvVars(); err != nil {
+		// Handle missing environment variable error
+		return nil, err
+	}
+
+	// Retrieve environment variables
+	envVars, err := GetEnvVars()
+	if err != nil {
+		return nil, fmt.Errorf("environment configuration failed (check required vars): %w", err)
+	}
+
+	// Create test options
+	options := &testhelper.TestOptions{
+		Testing:        t,
+		TerraformDir:   terraformDir,
+		IgnoreDestroys: testhelper.Exemptions{List: LSFIgnoreLists.Destroys},
+		IgnoreUpdates:  testhelper.Exemptions{List: LSFIgnoreLists.Updates},
+		TerraformVars: map[string]interface{}{
+			"cluster_prefix":          clusterNamePrefix,
+			"zones":                   utils.SplitAndTrim(envVars.Zones, ","),
+			"remote_allowed_ips":      utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","),
+			"existing_resource_group": existingResourceGroup,
+			"bastion_ssh_keys":        utils.SplitAndTrim(envVars.SSHKeys, ","),
+		},
+	}
+	return options, nil
+}
+
+// setupOptions creates a test options object with the given parameters.
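+//
+// A minimal usage sketch (the name TestExample is illustrative only; the flow simply
+// mirrors DefaultTest further down in this file):
+//
+//	func TestExample(t *testing.T) {
+//	    setupTestSuite(t)
+//	    prefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString())
+//	    envVars, err := GetEnvVars()
+//	    require.NoError(t, err)
+//	    options, err := setupOptions(t, prefix, terraformDir, envVars.DefaultExistingResourceGroup)
+//	    require.NoError(t, err)
+//	    output, err := options.RunTestConsistency()
+//	    require.NoError(t, err, "provisioning failed with output: %v", output)
+//	}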
+func setupOptions(t *testing.T, clusterNamePrefix, terraformDir, existingResourceGroup string) (*testhelper.TestOptions, error) { + + if err := checkRequiredEnvVars(); err != nil { + // Handle missing environment variable error + return nil, err + } + + envVars, err := GetEnvVars() + if err != nil { + return nil, fmt.Errorf("failed to get environment variables: %v", err) + } + + options := &testhelper.TestOptions{ + Testing: t, + TerraformDir: terraformDir, + IgnoreDestroys: testhelper.Exemptions{List: LSFIgnoreLists.Destroys}, + IgnoreUpdates: testhelper.Exemptions{List: LSFIgnoreLists.Updates}, + TerraformVars: map[string]interface{}{ + "cluster_prefix": clusterNamePrefix, + "ssh_keys": utils.SplitAndTrim(envVars.SSHKeys, ","), + "zones": utils.SplitAndTrim(envVars.Zones, ","), + "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), + "existing_resource_group": existingResourceGroup, + "deployer_instance": envVars.DeployerInstance, + "login_instance": envVars.LoginInstance, + "management_instances": envVars.ManagementInstances, + "key_management": envVars.KeyManagement, + "enable_hyperthreading": strings.ToLower(envVars.EnableHyperthreading), + "observability_atracker_enable": false, + "observability_monitoring_enable": false, + "dns_domain_name": envVars.DnsDomainName, + "static_compute_instances": envVars.StaticComputeInstances, + "dynamic_compute_instances": envVars.DynamicComputeInstances, + "bastion_instance": envVars.BastionInstance, + "sccwp_enable": false, + "cspm_enabled": false, + "custom_file_shares": envVars.CustomFileShares, + "enable_cos_integration": false, + "enable_vpc_flow_logs": false, + "app_center_gui_password": envVars.AppCenterGuiPassword, // pragma: allowlist secret + "lsf_version": envVars.LsfVersion, + }, + } + + // Remove empty values from TerraformVars + for key, value := range options.TerraformVars { + if value == "" { + delete(options.TerraformVars, key) + } + } + + return options, nil +} + +// GetLSFVersionConfig determines the correct config YAML file based on the LSF_VERSION +// environment variable. It accepts multiple aliases for convenience (e.g., "14", "lsf14", "fixpack_14"), +// normalizes them to standard constants, and returns the matching config file name. +func GetLSFVersionConfig() (string, error) { + // Step 1: Set default version + lsfVersion := DefaultLSFVersion + var productFileName string + + // Step 2: Check for environment override + if envVersion, ok := os.LookupEnv("LSF_VERSION"); ok { + lsfVersion = strings.ToLower(envVersion) // Normalize user input + } + + // Step 3: Normalize aliases and map to config file + switch lsfVersion { + case "fixpack_14", "lsf14", "14": + productFileName = lsfFP14ConfigFile + lsfVersion = LSF14 // Normalize for consistent internal use + case "fixpack_15", "lsf15", "15": + productFileName = lsfFP15ConfigFile + lsfVersion = LSF15 + default: + return "", fmt.Errorf("unsupported LSF version: %s (supported: fixpack_14, fixpack_15, lsf14, lsf15, 14, 15)", lsfVersion) + } + + // Step 4: Ensure normalized value is set in environment + if err := os.Setenv("LSF_VERSION", lsfVersion); err != nil { + return "", fmt.Errorf("failed to set normalized LSF_VERSION: %w", err) + } + + log.Printf("✅ Using LSF_VERSION: %s", lsfVersion) + return productFileName, nil +} + +// DefaultTest validates creation and verification of an HPC cluster +// Tests: +// - Successful cluster provisioning +// - Valid output structure +// - Resource cleanup + +func DefaultTest(t *testing.T) { + + // 1. 
Initialization + setupTestSuite(t) + if testLogger == nil { + t.Fatal("Logger initialization failed") + } + testLogger.Info(t, fmt.Sprintf("Test %s starting execution", t.Name())) + + // 2. Configuration + clusterNamePrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) + testLogger.Info(t, fmt.Sprintf("Generated cluster prefix: %s", clusterNamePrefix)) + + envVars, err := GetEnvVars() + if err != nil { + testLogger.Error(t, fmt.Sprintf("Environment config error: %v", err)) + } + require.NoError(t, err, "Environment configuration failed") + + options, err := setupOptions(t, clusterNamePrefix, terraformDir, envVars.DefaultExistingResourceGroup) + if err != nil { + testLogger.Error(t, fmt.Sprintf("Test setup error: %v", err)) + } + require.NoError(t, err, "Test options initialization failed") + + // 3. Execution & Validation + output, err := options.RunTestConsistency() + if err != nil { + testLogger.FAIL(t, fmt.Sprintf("Provisioning failed: %v", err)) + } + require.NoError(t, err, "Cluster provisioning failed with output: %v", output) + require.NotNil(t, output, "Received nil output from provisioning") + + // 4. Completion + testLogger.PASS(t, fmt.Sprintf("Test %s completed successfully", t.Name())) +} diff --git a/tests/lsf_tests/resource_exemptions.go b/tests/lsf_tests/resource_exemptions.go new file mode 100644 index 00000000..b6f080c5 --- /dev/null +++ b/tests/lsf_tests/resource_exemptions.go @@ -0,0 +1,57 @@ +package tests + +// ResourceExemptions contains lists of resources to ignore during Terraform operations +type ResourceExemptions struct { + Destroys []string // Resources to ignore during destroy operations + Updates []string // Resources to ignore during update operations +} + +// LSFIgnoreLists contains the standard resource exemptions for LSF cluster tests +var LSFIgnoreLists = ResourceExemptions{ + Destroys: []string{ + // Null resources used for provisioning checks + "module.landing_zone_vsi.module.hpc.module.check_cluster_status.null_resource.remote_exec[0]", + "module.landing_zone_vsi.module.hpc.module.check_node_status.null_resource.remote_exec[0]", + "module.landing_zone_vsi.module.hpc.module.check_node_status.null_resource.remote_exec[1]", + "module.landing_zone_vsi.module.hpc.module.check_node_status.null_resource.remote_exec[2]", + "module.check_node_status.null_resource.remote_exec[0]", + "module.check_node_status.null_resource.remote_exec[1]", + "module.check_node_status.null_resource.remote_exec[2]", + "module.check_cluster_status.null_resource.remote_exec[0]", + + // Boot waiting resources + "module.landing_zone_vsi.module.wait_management_vsi_booted.null_resource.remote_exec[0]", + "module.landing_zone_vsi.module.wait_management_candidate_vsi_booted.null_resource.remote_exec[0]", + "module.landing_zone_vsi[0].module.wait_management_vsi_booted.null_resource.remote_exec[0]", + "module.landing_zone_vsi[0].module.wait_management_candidate_vsi_booted.null_resource.remote_exec[0]", + "module.landing_zone_vsi[0].module.wait_management_candidate_vsi_booted.null_resource.remote_exec[1]", + "module.landing_zone_vsi[0].module.wait_worker_vsi_booted[0].null_resource.remote_exec[0]", + "module.landing_zone_vsi[0].module.wait_worker_vsi_booted[0].null_resource.remote_exec[1]", + + // Configuration resources + "module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_cp_files[0]", + "module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_cp_files[1]", + 
"module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_new_file[0]", + "module.landing_zone_vsi.module.do_management_candidate_vsi_configuration.null_resource.remote_exec_script_new_file[0]", + "module.landing_zone_vsi.module.do_management_candidate_vsi_configuration.null_resource.remote_exec_script_run[0]", + "module.landing_zone_vsi[0].module.do_management_vsi_configuration.null_resource.remote_exec_script_run[0]", + + // Other temporary resources + "module.lsf.module.resource_provisioner.null_resource.tf_resource_provisioner[0]", + "module.landing_zone_vsi[0].module.lsf_entitlement[0].null_resource.remote_exec[0]", + "module.landing_zone_vsi.module.hpc.module.landing_zone_vsi.module.wait_management_candidate_vsi_booted.null_resource.remote_exec[0]", + "module.landing_zone_vsi.module.hpc.module.landing_zone_vsi.module.wait_management_vsi_booted.null_resource.remote_exec[0]", + "module.lsf.module.prepare_tf_input.local_sensitive_file.prepare_tf_input[0]", + "module.compute_playbook[0].null_resource.run_playbook[0]", + }, + + Updates: []string{ + // File storage resources that can be updated without cluster impact + "module.file_storage.ibm_is_share.share[0]", + "module.file_storage.ibm_is_share.share[1]", + "module.file_storage.ibm_is_share.share[2]", + "module.file_storage.ibm_is_share.share[3]", + "module.file_storage.ibm_is_share.share[4]", + "module.lsf.module.prepare_tf_input.local_sensitive_file.prepare_tf_input[0]", + }, +} diff --git a/tests/other_test.go b/tests/other_test.go deleted file mode 100644 index 71327755..00000000 --- a/tests/other_test.go +++ /dev/null @@ -1,2984 +0,0 @@ -package tests - -import ( - "fmt" - "os" - "path/filepath" - "strings" - "testing" - - "github.com/stretchr/testify/require" - - "github.com/gruntwork-io/terratest/modules/terraform" - - "github.com/stretchr/testify/assert" - lsf "github.com/terraform-ibm-modules/terraform-ibm-hpc/lsf" - utils "github.com/terraform-ibm-modules/terraform-ibm-hpc/utilities" -) - -// Constants for better organization -const ( - createVpcTerraformDir = "examples/create_vpc/solutions/hpc" // Brand new VPC -) - -// TestRunBasic validates the cluster configuration. -func TestRunBasic(t *testing.T) { - - // Parallelize the test - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate that the basic cluster configuration is correct or not - lsf.ValidateClusterConfiguration(t, options, testLogger) - -} - -// TestRunCustomRGAsNull validates cluster creation with a null resource group value. 
-func TestRunCustomRGAsNull(t *testing.T) { - // Parallelize the test - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, LSF_CUSTOM_EXISTING_RESOURCE_GROUP_VALUE_AS_NULL, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate that the basic cluster configuration is correct or not - lsf.ValidateBasicClusterConfiguration(t, options, testLogger) - -} - -// TestRunCustomRGAsNonDefault validates cluster creation with a non-default resource group value. -func TestRunCustomRGAsNonDefault(t *testing.T) { - // Parallelize the test - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.NonDefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate that the basic cluster configuration is correct or not - lsf.ValidateBasicClusterConfiguration(t, options, testLogger) - -} - -// TestRunAppCenter validates cluster creation with the Application Center. -func TestRunAppCenter(t *testing.T) { - // Parallelize the test - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - options.TerraformVars["enable_app_center"] = strings.ToLower(envVars.EnableAppCenter) - options.TerraformVars["app_center_gui_pwd"] = envVars.AppCenterGuiPassword //pragma: allowlist secret - - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate that the basic cluster configuration is correct or not - lsf.ValidateClusterConfigurationWithAPPCenter(t, options, testLogger) - -} - -// TestRunSCC validates cluster creation with the SCC. 
-func TestRunSCCEnabled(t *testing.T) { - // Run the test in parallel - t.Parallel() - - // Set up the test suite - setupTestSuite(t) - - // Generate a random prefix for the HPC cluster - hpcClusterPrefix := utils.GenerateRandomString() - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Retrieve environment variables for cluster configuration - envVars := GetEnvVars() - - if strings.ToLower(envVars.sccEnabled) == "false" { - testLogger.Warn(t, fmt.Sprintf("%s will skip execution as the SCC enabled value in the %s_config.yml file is set to true", t.Name(), envVars.sccEnabled)) - return - } - - // Configure test options - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.NonDefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Enable SCC and set the event notification plan - options.TerraformVars["scc_enable"] = envVars.sccEnabled - options.TerraformVars["scc_event_notification_plan"] = envVars.sccEventNotificationPlan - options.TerraformVars["scc_location"] = envVars.sccLocation - options.TerraformVars["existing_resource_group"] = envVars.NonDefaultExistingResourceGroup - - // Skip test teardown; defer teardown to the end of the test - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate the basic cluster configuration with SCC - lsf.ValidateBasicClusterConfigurationWithSCC(t, options, testLogger) -} - -// TestRunPacHa validates the creation and configuration of an cluster with the Application Center -// in high-availability mode, ensuring that all required environment variables and configurations are set. -func TestRunPacHa(t *testing.T) { - // Parallelize the test - t.Parallel() - - // Setup the test suite - setupTestSuite(t) - - // Generate a unique HPC cluster prefix - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve the environment variable for app_center_existing_certificate_instance - existingCertInstance, ok := os.LookupEnv("APP_CENTER_EXISTING_CERTIFICATE_INSTANCE") - if !ok { - t.Fatal("When 'app_center_existing_certificate_instance' is set to true, the environment variable 'APP_CENTER_EXISTING_CERTIFICATE_INSTANCE' must be exported: export APP_CENTER_EXISTING_CERTIFICATE_INSTANCE=value") - } - - testLogger.Info(t, "Cluster creation process initiated for test: "+t.Name()) - - // Retrieve environment variables - envVars := GetEnvVars() - - // Configure test options - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Set Terraform variables - options.TerraformVars["management_node_count"] = 3 - options.TerraformVars["enable_app_center"] = strings.ToLower(envVars.EnableAppCenter) - options.TerraformVars["app_center_gui_pwd"] = envVars.AppCenterGuiPassword // pragma: allowlist secret - options.TerraformVars["app_center_high_availability"] = true // pragma: allowlist secret - options.TerraformVars["app_center_existing_certificate_instance"] = existingCertInstance - - // Skip teardown if specified - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate cluster configuration with PACHA - lsf.ValidateClusterConfigurationWithPACHA(t, options, testLogger) -} - -// TestRunNoKMSAndHTOff validates cluster creation with KMS set to null and hyperthreading disabled. 
-func TestRunNoKMSAndHTOff(t *testing.T) { - // Parallelize the test - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - options.TerraformVars["enable_cos_integration"] = false - options.TerraformVars["enable_vpc_flow_logs"] = false - options.TerraformVars["key_management"] = "null" - options.TerraformVars["hyperthreading_enabled"] = strings.ToLower("false") - - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateBasicClusterConfiguration(t, options, testLogger) -} - -// TestRunLSFClusterCreationWithZeroWorkerNodes validates the cluster creation process -// for the LSF solution when the minimum worker node count is set to zero for both static profile counts. -func TestRunLSFClusterCreationWithZeroWorkerNodes(t *testing.T) { - // Allow the test to run concurrently with others. - t.Parallel() - - // Set up the test suite environment. - setupTestSuite(t) - testLogger.Info(t, "Initiating cluster creation process for "+t.Name()) - - // Generate a unique prefix for the HPC cluster. - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve environment variables for the test. - envVars := GetEnvVars() - - // Validate and apply LSF-specific configurations if the solution is LSF. - if envVars.Solution == "lsf" { - // Set up Terraform options. - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Failed to set up Terraform options: %v", err) - - // Configure the lower profile and the minimum worker node count for the cluster. - options.TerraformVars["worker_node_instance_type"] = []map[string]interface{}{ - { - "count": 0, - "instance_type": "bx2-2x8", - }, - { - "count": 0, - "instance_type": "cx2-2x4", - }, - } - // Skip automatic teardown for further inspection post-test. - options.SkipTestTearDown = true - defer options.TestTearDown() - - //Validate the basic cluster configuration. - lsf.ValidateBasicClusterConfigurationWithDynamicProfile(t, options, testLogger) - testLogger.Info(t, "Cluster configuration validation completed successfully.") - } else { - testLogger.Warn(t, "Test skipped as the solution is not 'lsf'.") - t.Skip("This test is applicable only for the 'lsf' solution.") - } -} - -// TestRunInUsEastRegion validates the cluster creation process in the US East region using the b* profile. -func TestRunInUsEastRegion(t *testing.T) { - // Allow the test to run concurrently with others. - t.Parallel() - - // Set up the test suite environment. - setupTestSuite(t) - testLogger.Info(t, "Starting cluster creation process for test: "+t.Name()) - - // Generate a unique prefix for the HPC cluster. - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve and validate environment variables. 
- envVars := GetEnvVars() - usEastZone := utils.SplitAndTrim(envVars.USEastZone, ",") - require.NotEmpty(t, usEastZone, "Environment variable USEastZone must be provided and contain valid values.") - testLogger.Info(t, fmt.Sprintf("Validated US East zone configuration: %v", usEastZone)) - - // Declare variables for solution-specific configurations. - var usEastClusterName, usEastReservationID string - - // Apply configurations based on the solution type. - if envVars.Solution == "HPC" { - usEastClusterName = envVars.USEastClusterName - usEastReservationID = envVars.USEastReservationID - - // Validate HPC-specific configurations. - require.NotEmpty(t, usEastClusterName, "Environment variable USEastClusterName is required for the HPC solution.") - require.NotEmpty(t, usEastReservationID, "Environment variable USEastReservationID is required for the HPC solution.") - testLogger.Info(t, fmt.Sprintf("HPC-specific configuration validated for US East: Cluster ID - %s, Reservation ID - %s", usEastClusterName, usEastReservationID)) - } - - // Set up Terraform options. - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Failed to set up Terraform options.") - - // Assign solution-specific Terraform variables. - switch envVars.Solution { - case "HPC": - options.TerraformVars["zones"] = usEastZone - options.TerraformVars["reservation_id"] = usEastReservationID - options.TerraformVars["cluster_name"] = usEastClusterName - testLogger.Info(t, "Terraform variables configured for HPC solution.") - case "lsf": - options.TerraformVars["zones"] = usEastZone - options.TerraformVars["worker_node_instance_type"] = []map[string]interface{}{ - { - "count": 2, - "instance_type": "bx2-2x8", - }, - { - "count": 0, - "instance_type": "cx2-2x4", - }, - } - testLogger.Info(t, "Terraform variables configured for LSF solution.") - } - - // Skip automatic teardown for further inspection post-test. - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate the basic cluster configuration. - lsf.ValidateBasicClusterConfiguration(t, options, testLogger) - testLogger.Info(t, "Cluster configuration validation completed successfully.") -} - -// TestRunInEuDeRegion validates the cluster creation process in the Frankfurt region using the c* profile. -func TestRunInEuDeRegion(t *testing.T) { - // Allow the test to run concurrently with others. - t.Parallel() - - // Set up the test suite environment. - setupTestSuite(t) - testLogger.Info(t, "Initiating cluster creation process for test: "+t.Name()) - - // Generate a unique prefix for the HPC cluster. - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve and validate environment variables. - envVars := GetEnvVars() - euDeZone := utils.SplitAndTrim(envVars.EUDEZone, ",") - require.NotEmpty(t, euDeZone, "Frankfurt zone configuration must be provided.") - testLogger.Info(t, fmt.Sprintf("Frankfurt zone configuration validated: %s", euDeZone)) - - // Declare variables for solution-specific configurations. - var euDeClusterName, euDeReservationID string - - // Configure based on the solution type. 
- if envVars.Solution == "HPC" { - euDeClusterName = envVars.EUDEClusterName - euDeReservationID = envVars.EUDEReservationID - - require.NotEmpty(t, euDeClusterName, "Cluster ID for Frankfurt region must be provided in environment variables.") - require.NotEmpty(t, euDeReservationID, "Reservation ID for Frankfurt region must be provided in environment variables.") - testLogger.Info(t, fmt.Sprintf("HPC-specific configuration validated for Frankfurt: Cluster ID - %s, Reservation ID - %s", euDeClusterName, euDeReservationID)) - } - - // Set up Terraform options. - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Failed to set up Terraform options.") - - // Assign solution-specific Terraform variables. - switch envVars.Solution { - case "HPC": - options.TerraformVars["zones"] = euDeZone - options.TerraformVars["reservation_id"] = euDeReservationID - options.TerraformVars["cluster_name"] = euDeClusterName - testLogger.Info(t, "Terraform variables configured for HPC in Frankfurt.") - case "lsf": - options.TerraformVars["zones"] = euDeZone - options.TerraformVars["worker_node_instance_type"] = []map[string]interface{}{ - { - "count": 2, - "instance_type": "cx2-2x4", - }, - { - "count": 0, - "instance_type": "bx2-2x8", - }, - } - - testLogger.Info(t, "Terraform variables configured for LSF in Frankfurt.") - } - - // Skip automatic teardown for further inspection post-test. - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate the basic cluster configuration. - lsf.ValidateBasicClusterConfiguration(t, options, testLogger) - testLogger.Info(t, "Cluster configuration validation completed successfully.") -} - -// TestRunInUSSouthRegion validates the cluster creation process in the US South region using the m* profile. -func TestRunInUSSouthRegion(t *testing.T) { - // Allow the test to run concurrently with others. - t.Parallel() - - // Set up the test suite environment. - setupTestSuite(t) - testLogger.Info(t, "Initiating cluster creation process for test: "+t.Name()) - - // Generate a unique prefix for the HPC cluster. - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve and validate environment variables. - envVars := GetEnvVars() - usSouthZone := utils.SplitAndTrim(envVars.USSouthZone, ",") - require.NotEmpty(t, usSouthZone, "US South zone configuration must be provided.") - testLogger.Info(t, fmt.Sprintf("US South zone configuration validated: %s", usSouthZone)) - - // Declare variables for solution-specific configurations. - var usSouthClusterName, usSouthReservationID string - - // Configure based on the solution type. - if envVars.Solution == "HPC" { - usSouthClusterName = envVars.USSouthClusterName - usSouthReservationID = envVars.USSouthReservationID - - require.NotEmpty(t, usSouthClusterName, "Cluster ID for US South region must be provided in environment variables.") - require.NotEmpty(t, usSouthReservationID, "Reservation ID for US South region must be provided in environment variables.") - testLogger.Info(t, fmt.Sprintf("HPC-specific configuration validated for US South: Cluster ID - %s, Reservation ID - %s", usSouthClusterName, usSouthReservationID)) - } - - // Set up Terraform options. - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Failed to set up Terraform options.") - - // Assign solution-specific Terraform variables. 
- switch envVars.Solution { - case "HPC": - options.TerraformVars["zones"] = usSouthZone - options.TerraformVars["reservation_id"] = usSouthReservationID - options.TerraformVars["cluster_name"] = usSouthClusterName - testLogger.Info(t, "Terraform variables configured for HPC in US South.") - case "lsf": - options.TerraformVars["zones"] = usSouthZone - options.TerraformVars["worker_node_instance_type"] = []map[string]interface{}{ - { - "count": 2, - "instance_type": "mx2-2x16", - }, - { - "count": 0, - "instance_type": "cx2-2x4", - }, - } - testLogger.Info(t, "Terraform variables configured for LSF in US South.") - } - - // Skip automatic teardown for further inspection post-test. - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate the basic cluster configuration. - lsf.ValidateBasicClusterConfiguration(t, options, testLogger) - testLogger.Info(t, "Cluster configuration validation completed successfully.") -} - -// TestRunInJPTokyoRegion validates the cluster creation process in the jp tokyo region using the m* profile. -func TestRunInJPTokyoRegion(t *testing.T) { - // Allow the test to run concurrently with others. - t.Parallel() - - // Set up the test suite environment. - setupTestSuite(t) - testLogger.Info(t, "Initiating cluster creation process for test: "+t.Name()) - - // Generate a unique prefix for the HPC cluster. - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve and validate environment variables. - envVars := GetEnvVars() - jpTokyoZone := utils.SplitAndTrim(envVars.JPTokZone, ",") - require.NotEmpty(t, jpTokyoZone, "JP Tokyo zone configuration must be provided.") - testLogger.Info(t, fmt.Sprintf("JP Tokyo zone configuration validated: %s", jpTokyoZone)) - - // Declare variables for solution-specific configurations. - var jpTokyoClusterName, jpTokyoReservationID string - - // Configure based on the solution type. - if envVars.Solution == "HPC" { - jpTokyoClusterName = envVars.JPTokClusterName - jpTokyoReservationID = envVars.JPTokReservationID - - require.NotEmpty(t, jpTokyoClusterName, "Cluster ID for JP Tokyo region must be provided in environment variables.") - require.NotEmpty(t, jpTokyoReservationID, "Reservation ID for JP Tokyo region must be provided in environment variables.") - testLogger.Info(t, fmt.Sprintf("HPC-specific configuration validated for JP Tokyo : Cluster ID - %s, Reservation ID - %s", jpTokyoClusterName, jpTokyoReservationID)) - } - - // Set up Terraform options. - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Failed to set up Terraform options.") - - // Assign solution-specific Terraform variables. - switch envVars.Solution { - case "HPC": - options.TerraformVars["zones"] = jpTokyoZone - options.TerraformVars["cluster_name"] = jpTokyoClusterName - options.TerraformVars["reservation_id"] = jpTokyoReservationID - testLogger.Info(t, "Terraform variables configured for HPC in JP Tokyo.") - case "lsf": - options.TerraformVars["zones"] = jpTokyoZone - options.TerraformVars["worker_node_instance_type"] = []map[string]interface{}{ - { - "count": 1, - "instance_type": "mx3d-128x1280", - }, - { - "count": 0, - "instance_type": "cx3d-24x60", - }, - } - testLogger.Info(t, "Terraform variables configured for LSF in JP Tokyo.") - } - - // Skip automatic teardown for further inspection post-test. - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate the basic cluster configuration. 
- lsf.ValidateBasicClusterConfiguration(t, options, testLogger) - testLogger.Info(t, "Cluster configuration validation completed successfully.") -} - -// TestRunLDAP validates cluster creation with LDAP enabled. -func TestRunLDAP(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - if strings.ToLower(envVars.EnableLdap) == "true" { - // Check if the Reservation ID contains 'WES' and cluster ID is not empty - if len(envVars.LdapAdminPassword) == 0 || len(envVars.LdapUserName) == 0 || len(envVars.LdapUserPassword) == 0 { - require.FailNow(t, "LDAP credentials are missing. Make sure LDAP admin password, LDAP user name, and LDAP user password are provided.") - } - } else { - require.FailNow(t, "LDAP is not enabled. Set the 'enable_ldap' environment variable to 'true' to enable LDAP.") - } - - // Set Terraform variables - options.TerraformVars["enable_ldap"] = strings.ToLower(envVars.EnableLdap) - options.TerraformVars["ldap_basedns"] = envVars.LdapBaseDns - options.TerraformVars["ldap_admin_password"] = envVars.LdapAdminPassword //pragma: allowlist secret - options.TerraformVars["ldap_user_name"] = envVars.LdapUserName - options.TerraformVars["ldap_user_password"] = envVars.LdapUserPassword //pragma: allowlist secret - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateLDAPClusterConfiguration(t, options, testLogger) -} - -// TestRunUsingExistingKMS validates cluster creation using an existing KMS. 
-func TestRunUsingExistingKMS(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Service instance name - randomString := utils.GenerateRandomString() - kmsInstanceName := "cicd-" + randomString - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Create service instance and KMS key using IBMCloud CLI - err := lsf.CreateServiceInstanceAndKmsKey(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultExistingResourceGroup, kmsInstanceName, lsf.KMS_KEY_NAME, testLogger) - require.NoError(t, err, "Service instance and KMS key creation failed") - - testLogger.Info(t, "Service instance and KMS key created successfully "+t.Name()) - - // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Set Terraform variables - options.TerraformVars["key_management"] = "key_protect" - options.TerraformVars["kms_instance_name"] = kmsInstanceName - options.TerraformVars["kms_key_name"] = lsf.KMS_KEY_NAME - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - - // Ensure the service instance and KMS key are deleted after the test - defer lsf.DeleteServiceInstanceAndAssociatedKeys(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultExistingResourceGroup, kmsInstanceName, testLogger) - defer options.TestTearDown() - - lsf.ValidateBasicClusterConfiguration(t, options, testLogger) -} - -// TestRunUsingExistingKMSInstanceIDAndWithOutKey validates cluster creation using an existing KMS. 
-func TestRunUsingExistingKMSInstanceIDAndWithoutKey(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Service instance name - randomString := utils.GenerateRandomString() - kmsInstanceName := "cicd-" + randomString - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Create service instance and KMS key using IBMCloud CLI - err := lsf.CreateServiceInstanceAndKmsKey(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultExistingResourceGroup, kmsInstanceName, lsf.KMS_KEY_NAME, testLogger) - require.NoError(t, err, "Service instance and KMS key creation failed") - - testLogger.Info(t, "Service instance and KMS key created successfully "+t.Name()) - - // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Set Terraform variables - options.TerraformVars["key_management"] = "key_protect" - options.TerraformVars["kms_instance_name"] = kmsInstanceName - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - - // Ensure the service instance and KMS key are deleted after the test - defer lsf.DeleteServiceInstanceAndAssociatedKeys(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultExistingResourceGroup, kmsInstanceName, testLogger) - defer options.TestTearDown() - - lsf.ValidateBasicClusterConfiguration(t, options, testLogger) -} - -// TestRunLDAPAndPac validates cluster creation with both Application Center (PAC) and LDAP enabled. -func TestRunLDAPAndPac(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - if strings.ToLower(envVars.EnableLdap) == "true" { - // Check if the Reservation ID contains 'WES' and cluster ID is not empty - if len(envVars.LdapAdminPassword) == 0 || len(envVars.LdapUserName) == 0 || len(envVars.LdapUserPassword) == 0 { - require.FailNow(t, "LDAP credentials are missing. Make sure LDAP admin password, LDAP user name, and LDAP user password are provided.") - } - } else { - require.FailNow(t, "LDAP is not enabled. 
Set the 'enable_ldap' environment variable to 'true' to enable LDAP.") - } - - // Set Terraform variables - options.TerraformVars["enable_app_center"] = strings.ToLower(envVars.EnableAppCenter) - options.TerraformVars["app_center_gui_pwd"] = envVars.AppCenterGuiPassword //pragma: allowlist secret - options.TerraformVars["enable_ldap"] = strings.ToLower(envVars.EnableLdap) - options.TerraformVars["ldap_basedns"] = envVars.LdapBaseDns - options.TerraformVars["ldap_admin_password"] = envVars.LdapAdminPassword //pragma: allowlist secret - options.TerraformVars["ldap_user_name"] = envVars.LdapUserName - options.TerraformVars["ldap_user_password"] = envVars.LdapUserPassword //pragma: allowlist secret - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidatePACANDLDAPClusterConfiguration(t, options, testLogger) -} - -// TestRunCreateVpc as brand new -func TestRunCreateVpc(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - testLogger.Info(t, "Brand new VPC creation initiated for "+t.Name()) - - // Define the HPC cluster prefix - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment - options, err := setupOptionsVpc(t, hpcClusterPrefix, createVpcTerraformDir, envVars.DefaultExistingResourceGroup) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Run the test - output, err := options.RunTest() - require.NoError(t, err, "Error running consistency test: %v", err) - require.NotNil(t, output, "Expected non-nil output, but got nil") - - outputs := (options.LastTestTerraformOutputs) - vpcName := outputs["vpc_name"].(string) - bastionsubnetId, computesubnetIds := utils.GetSubnetIds(outputs) - - RunHpcExistingVpcCidr(t, vpcName) - RunHpcExistingVpcSubnetIdCustomNullDnsNull(t, vpcName, bastionsubnetId, computesubnetIds) -} - -// RunHpcExistingVpcCidr with Cidr blocks -func RunHpcExistingVpcCidr(t *testing.T, vpcName string) { - fmt.Println("********* Started Executing RunHpcExistingVpcCidr ********* ") - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Static values for CIDR other than default CIDR - vpcClusterPrivateSubnetsCidrBlocks := "10.241.48.0/21" - vpcClusterLoginPrivateSubnetsCidrBlocks := "10.241.60.0/22" - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - options.TerraformVars["vpc_name"] = vpcName - options.TerraformVars["vpc_cluster_private_subnets_cidr_blocks"] = utils.SplitAndTrim(vpcClusterPrivateSubnetsCidrBlocks, ",") - 
options.TerraformVars["vpc_cluster_login_private_subnets_cidr_blocks"] = utils.SplitAndTrim(vpcClusterLoginPrivateSubnetsCidrBlocks, ",") - require.NoError(t, err, "Error setting up test options: %v", err) - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateClusterConfiguration(t, options, testLogger) - fmt.Println("********* Ended Executing RunHpcExistingVpcCidr ********* ") -} - -// RunHpcExistingVpcSubnetIdCustomNullDnsNull with compute and login subnet id. Both custom_resolver and dns_instace null -func RunHpcExistingVpcSubnetIdCustomNullDnsNull(t *testing.T, vpcName string, bastionsubnetId string, computesubnetIds string) { - fmt.Println("********* Started Executing RunHpcExistingVpcSubnetIdCustomNullDnsNull ********* ") - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - options.TerraformVars["vpc_name"] = vpcName - options.TerraformVars["login_subnet_id"] = bastionsubnetId - options.TerraformVars["cluster_subnet_ids"] = utils.SplitAndTrim(computesubnetIds, ",") - require.NoError(t, err, "Error setting up test options: %v", err) - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateClusterConfiguration(t, options, testLogger) - fmt.Println("********* Ended Executing RunHpcExistingVpcSubnetIdCustomNullDnsNull ********* ") -} - -// TestRunCreateVpcWithCustomDns brand new VPC with DNS -func TestRunVpcWithCustomDns(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Define the HPC cluster prefix - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment - options, err := setupOptionsVpc(t, hpcClusterPrefix, createVpcTerraformDir, envVars.DefaultExistingResourceGroup) - options.TerraformVars["enable_hub"] = true - - require.NoError(t, err, "Error setting up test options: %v", err) - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Run the test - output, err := options.RunTestConsistency() - require.NoError(t, err, "Error running consistency test: %v", err) - require.NotNil(t, output, "Expected non-nil output, but got nil") - - outputs := (options.LastTestTerraformOutputs) - vpcName := outputs["vpc_name"].(string) - instanceId, customResolverId := utils.GetDnsCustomResolverIds(outputs) - bastionsubnetId, computesubnetIds := utils.GetSubnetIds(outputs) - - RunHpcExistingVpcBothCustomDnsExist(t, 
vpcName, bastionsubnetId, computesubnetIds, instanceId, customResolverId) - RunHpcExistingVpcCustomExistDnsNull(t, vpcName, bastionsubnetId, computesubnetIds, customResolverId) - RunHpcExistingVpcCustomNullDnsExist(t, instanceId) -} - -// RunHpcExistingVpcCustomDns with existing custom_resolver_id and dns_instance_id -func RunHpcExistingVpcBothCustomDnsExist(t *testing.T, vpcName string, bastionsubnetId string, computesubnetIds string, instanceId string, customResolverId string) { - fmt.Println("********* Started Executing RunHpcExistingVpcBothCustomDnsExist ********* ") - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - options.TerraformVars["vpc_name"] = vpcName - options.TerraformVars["login_subnet_id"] = bastionsubnetId - options.TerraformVars["cluster_subnet_ids"] = utils.SplitAndTrim(computesubnetIds, ",") - options.TerraformVars["dns_instance_id"] = instanceId - options.TerraformVars["dns_custom_resolver_id"] = customResolverId - - require.NoError(t, err, "Error setting up test options: %v", err) - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateClusterConfiguration(t, options, testLogger) - fmt.Println("********* Ended Executing RunHpcExistingVpcBothCustomDnsExist ********* ") -} - -// RunHpcExistingVpcCustomExistDnsNull with existing custom_resolver_id and new dns_instance_id -func RunHpcExistingVpcCustomExistDnsNull(t *testing.T, vpcName string, bastionsubnetId string, computesubnetIds string, customResolverId string) { - fmt.Println("********* Started Executing RunHpcExistingVpcCustomExistDnsNull ********* ") - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - options.TerraformVars["vpc_name"] = vpcName - options.TerraformVars["login_subnet_id"] = bastionsubnetId - options.TerraformVars["cluster_subnet_ids"] = utils.SplitAndTrim(computesubnetIds, ",") - options.TerraformVars["dns_custom_resolver_id"] = customResolverId - - require.NoError(t, err, "Error setting up test options: %v", err) - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateClusterConfiguration(t, options, testLogger) - fmt.Println("********* Ended Executing RunHpcExistingVpcCustomExistDnsNull ********* ") -} - -// 
RunHpcExistingVpcCustomNullDnsExist with custom_resolver_id null and existing dns_instance_id -func RunHpcExistingVpcCustomNullDnsExist(t *testing.T, instanceId string) { - fmt.Println("********* Started Executing RunHpcExistingVpcCustomNullDnsExist ********* ") - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - options.TerraformVars["dns_instance_id"] = instanceId - - require.NoError(t, err, "Error setting up test options: %v", err) - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateClusterConfiguration(t, options, testLogger) - fmt.Println("********* Ended Executing RunHpcExistingVpcCustomNullDnsExist ********* ") -} - -func TestRunCIDRsAsNonDefault(t *testing.T) { - // Parallelize the test - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - options.TerraformVars["vpc_cidr"] = "10.243.0.0/18" - options.TerraformVars["vpc_cluster_private_subnets_cidr_blocks"] = []string{"10.243.0.0/20"} - options.TerraformVars["vpc_cluster_login_private_subnets_cidr_blocks"] = []string{"10.243.16.0/28"} - - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateBasicClusterConfiguration(t, options, testLogger) -} - -// TestRunCosAndVpcFlowLogs validates cluster creation with vpc flow logs and cos enabled. 
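The non-default CIDRs chosen in TestRunCIDRsAsNonDefault above (10.243.0.0/20 and 10.243.16.0/28 inside a 10.243.0.0/18 VPC) can be sanity-checked locally with the standard library before an expensive apply. A hedged sketch, separate from the test suite, which performs no such pre-check itself:

```go
package main

import (
	"fmt"
	"net"
)

// withinVPC reports whether subnetCIDR is a block nested inside vpcCIDR.
// Illustrative helper only; the module and the VPC API do the real validation.
func withinVPC(vpcCIDR, subnetCIDR string) (bool, error) {
	_, vpcNet, err := net.ParseCIDR(vpcCIDR)
	if err != nil {
		return false, err
	}
	_, subnetNet, err := net.ParseCIDR(subnetCIDR)
	if err != nil {
		return false, err
	}
	vpcOnes, _ := vpcNet.Mask.Size()
	subnetOnes, _ := subnetNet.Mask.Size()
	// CIDR blocks either nest or are disjoint, so it suffices to check that the
	// subnet's network address lies inside the VPC and its prefix is no shorter.
	return vpcNet.Contains(subnetNet.IP) && subnetOnes >= vpcOnes, nil
}

func main() {
	for _, cidr := range []string{"10.243.0.0/20", "10.243.16.0/28"} {
		ok, err := withinVPC("10.243.0.0/18", cidr)
		fmt.Println(cidr, ok, err) // both report true
	}
}
```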
-func TestRunCosAndVpcFlowLogs(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Set Terraform variables - options.TerraformVars["enable_cos_integration"] = true - options.TerraformVars["enable_vpc_flow_logs"] = true - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos(t, options, testLogger) -} - -// TestRunMultipleSSHKeys validates the cluster configuration. -func TestRunMultipleSSHKeys(t *testing.T) { - - // Parallelize the test - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateClusterConfigurationWithMultipleKeys(t, options, testLogger) - -} - -// TestRunExistingLDAP validates the creation and configuration of HPC clusters with LDAP integration, including setup, validation, and error handling for both clusters. -func TestRunExistingLDAP(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Setup the test suite - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate random prefix for HPC cluster - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve environment variables - envVars := GetEnvVars() - - // Ensure LDAP is enabled and credentials are provided - if strings.ToLower(envVars.EnableLdap) == "true" { - if len(envVars.LdapAdminPassword) == 0 || len(envVars.LdapUserName) == 0 || len(envVars.LdapUserPassword) == 0 { - require.FailNow(t, "LDAP credentials are missing. Ensure LDAP admin password, LDAP user name, and LDAP user password are provided.") - } - } else { - require.FailNow(t, "LDAP is not enabled. 
Set the 'enable_ldap' environment variable to 'true' to enable LDAP.") - } - - // Set up the test options with the relevant parameters, including environment variables and resource group for the first cluster - options1, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options for the first cluster: %v", err) - - // Set Terraform variables for the first cluster - options1.TerraformVars["management_node_count"] = 1 - options1.TerraformVars["enable_ldap"] = strings.ToLower(envVars.EnableLdap) - options1.TerraformVars["ldap_basedns"] = envVars.LdapBaseDns - options1.TerraformVars["ldap_admin_password"] = envVars.LdapAdminPassword // pragma: allowlist secret - options1.TerraformVars["ldap_user_name"] = envVars.LdapUserName - options1.TerraformVars["ldap_user_password"] = envVars.LdapUserPassword // pragma: allowlist secret - options1.TerraformVars["key_management"] = "null" - options1.TerraformVars["enable_cos_integration"] = false - options1.TerraformVars["enable_vpc_flow_logs"] = false - - // Skip test teardown for further inspection - options1.SkipTestTearDown = true - defer options1.TestTearDown() - - // Run the test and validate output - output, err := options1.RunTest() - require.NoError(t, err, "Error running test: %v", err) - require.NotNil(t, output, "Expected non-nil output, but got nil") - - // Retrieve custom resolver ID - customResolverID, err := utils.GetCustomResolverID(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultExistingResourceGroup, hpcClusterPrefix, testLogger) - require.NoError(t, err, "Error retrieving custom resolver ID: %v", err) - - // Retrieve LDAP IP and Bastion IP - ldapIP, err := utils.GetLdapIP(t, options1, testLogger) - require.NoError(t, err, "Error retrieving LDAP IP address: %v", err) - - ldapServerBastionIP, err := utils.GetBastionIP(t, options1, testLogger) - require.NoError(t, err, "Error retrieving LDAP server bastion IP address: %v", err) - - // Update security group for LDAP - err = utils.RetrieveAndUpdateSecurityGroup(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultExistingResourceGroup, hpcClusterPrefix, "10.241.0.0/18", "389", "389", testLogger) - require.NoError(t, err, "Error updating security group: %v", err) - - testLogger.Info(t, "Cluster creation process for the second cluster initiated for "+t.Name()) - - // Generate random prefix for the second HPC cluster - hpcClusterPrefix2 := utils.GenerateRandomString() - - // Retrieve LDAP server certificate via SSH and assert no connection errors. 
- ldapServerCert, serverCertErr := lsf.GetLDAPServerCert(lsf.LSF_PUBLIC_HOST_NAME, ldapServerBastionIP, lsf.LSF_LDAP_HOST_NAME, ldapIP) - require.NoError(t, serverCertErr, "Failed to retrieve LDAP server certificate via SSH") - - testLogger.Info(t, ldapServerCert) - - // Set up the test options with the relevant parameters, including environment variables and resource group for the second cluster - options2, err := setupOptions(t, hpcClusterPrefix2, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options for the second cluster: %v", err) - - // Set Terraform variables for the second cluster - options2.TerraformVars["vpc_name"] = options1.TerraformVars["cluster_prefix"].(string) + "-lsf-vpc" - options2.TerraformVars["vpc_cluster_private_subnets_cidr_blocks"] = []string{CLUSTER_TWO_VPC_CLUSTER_PRIVATE_SUBNETS_CIDR_BLOCKS} - options2.TerraformVars["vpc_cluster_login_private_subnets_cidr_blocks"] = []string{CLUSTER_TWO_VPC_CLUSTER_LOGIN_PRIVATE_SUBNETS_CIDR_BLOCKS} - options2.TerraformVars["management_node_count"] = 2 - options2.TerraformVars["dns_domain_name"] = map[string]string{"compute": CLUSTER_TWO_DNS_DOMAIN_NAME} - options2.TerraformVars["dns_custom_resolver_id"] = customResolverID - options2.TerraformVars["enable_ldap"] = strings.ToLower(envVars.EnableLdap) - options2.TerraformVars["ldap_basedns"] = envVars.LdapBaseDns - options2.TerraformVars["ldap_server"] = ldapIP - options2.TerraformVars["ldap_server_cert"] = strings.TrimSpace(ldapServerCert) - - // Skip test teardown for further inspection - options2.SkipTestTearDown = true - defer options2.TestTearDown() - - // Validate LDAP configuration for the second cluster - lsf.ValidateExistingLDAPClusterConfig(t, ldapServerBastionIP, ldapIP, envVars.LdapBaseDns, envVars.LdapAdminPassword, envVars.LdapUserName, envVars.LdapUserPassword, options2, testLogger) -} - -// TestRunLSFLogs validates the cluster creation process, focusing on the following: -// Ensures cloud logs are correctly validated. -// Verifies that LSF management logs are stored in the designated directory within the shared folder. -// Checks for the presence of symbolic links to the logs. -// Confirms the cluster setup passes basic configuration and validation checks. -// Prerequisites: -// - The cluster should have at least two management nodes. -func TestRunLSFLogs(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Prevent automatic test teardown to allow for further inspection, if needed. - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Validate the basic cluster configuration and LSF management logs. 
- lsf.ValidateBasicClusterConfigurationLSFLogs(t, options, testLogger) -} - -// TestRunDedicatedHost validates cluster creation -func TestRunDedicatedHost(t *testing.T) { - // Parallelize the test - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - options.TerraformVars["enable_dedicated_host"] = true - options.TerraformVars["worker_node_instance_type"] = []map[string]interface{}{ - { - "count": 1, - "instance_type": "bx2-2x8", - }, - } - - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateBasicClusterConfigurationWithDedicatedHost(t, options, true, testLogger) - -} - -// TestRunObservabilityCloudLogsManagementAndComputeEnabled validates the creation of a cluster -// with observability features enabled for both management and compute nodes. The test ensures that the -// cluster setup passes basic validation checks, confirming that the observability features for both management -// and compute are properly configured and functional, while platform logs and monitoring are disabled. -func TestRunObservabilityCloudLogsManagementAndComputeEnabled(t *testing.T) { - // Run the test in parallel with other tests to optimize test execution - t.Parallel() - - // Set up the test suite and environment configuration - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve necessary environment variables to configure the test - envVars := GetEnvVars() - - // Set up test options with relevant parameters, including resource group and environment variables - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Configure the observability settings for management and compute logs, with platform logs and monitoring disabled - options.TerraformVars["observability_logs_enable_for_management"] = true - options.TerraformVars["observability_logs_enable_for_compute"] = true - options.TerraformVars["observability_enable_platform_logs"] = false - options.TerraformVars["observability_monitoring_enable"] = false - - // Prevent automatic test teardown for inspection after the test runs - options.SkipTestTearDown = true - - // Ensure test teardown is executed at the end of the test - defer options.TestTearDown() - - // Validate that the basic cluster configuration is correct with cloud logs enabled for management and compute nodes - lsf.ValidateBasicClusterConfigurationWithCloudLogs(t, options, testLogger) -} - -// TestRunObservabilityCloudLogsManagementEnabled validates the creation of a cluster -// with observability 
features enabled only for management nodes. This test ensures: -// Management node logs are properly configured while compute node logs are disabled. -// Platform logs are enabled for platform-level observability. -// Monitoring features are explicitly disabled. -// The cluster setup passes basic validation checks. - -func TestRunObservabilityCloudLogsManagementEnabled(t *testing.T) { - // Run the test in parallel for efficiency - t.Parallel() - - // Set up the test suite and initialize the environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, fmt.Sprintf("Cluster creation process initiated for test: %s", t.Name())) - - // Generate a unique cluster prefix for the test - clusterPrefix := utils.GenerateRandomString() - - // Retrieve environment variables required for configuration - envVars := GetEnvVars() - - // Configure test options with Terraform variables and environment settings - options, err := setupOptions(t, clusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Failed to set up test options: %v", err) - - // Configure observability settings: - options.TerraformVars["observability_logs_enable_for_management"] = true - options.TerraformVars["observability_logs_enable_for_compute"] = false - options.TerraformVars["observability_monitoring_enable"] = false - - // Check if platform logs already exist for the given region and resource group - platformLogsExist, err := lsf.CheckPlatformLogsPresent(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultExistingResourceGroup, testLogger) - require.NoError(t, err, "Error checking platform logs for cluster: %v", err) - - // Set platform logs configuration based on their existence in the region - if platformLogsExist { - options.TerraformVars["observability_enable_platform_logs"] = false // Reuse existing platform logs - } else { - options.TerraformVars["observability_enable_platform_logs"] = true // Enable platform logs - } - - testLogger.Info(t, fmt.Sprintf("%v", platformLogsExist)) - // Skip automatic test teardown to allow for manual inspection after the test - options.SkipTestTearDown = true - - // Ensure teardown is executed at the end of the test - defer options.TestTearDown() - - // Validate the basic cluster configuration with the specified observability settings - lsf.ValidateBasicClusterConfigurationWithCloudLogs(t, options, testLogger) -} - -// TestRunObservabilityCloudLogsManagementAndComputeDisabled validates the creation of a cluster -// with observability features disabled for both management and compute nodes. This test ensures: -// Both management and compute logs are disabled. -// Monitoring features are explicitly disabled. -// The cluster setup passes basic validation checks. 
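The test below guards its deferred teardown behind a nil check so that a failed setup cannot cause a panic during cleanup. A minimal, generic sketch of the same pattern; the type and method names here are illustrative stand-ins, not the suite's actual options type:

```go
package main

import "fmt"

// clusterOptions stands in for the test-options object returned by setup.
type clusterOptions struct{ name string }

func (o *clusterOptions) TearDown() { fmt.Println("tearing down", o.name) }

func run(setupFails bool) {
	var options *clusterOptions

	// Register the guarded cleanup up front so it runs no matter how setup goes.
	defer func() {
		if options != nil {
			options.TearDown()
		}
		// When options is nil, teardown is skipped instead of panicking.
	}()

	if !setupFails {
		options = &clusterOptions{name: "cluster"}
	}
	fmt.Println("setup failed:", setupFails)
}

func main() {
	run(false) // setup succeeds, teardown runs
	run(true)  // setup fails, teardown is safely skipped
}
```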
-func TestRunObservabilityCloudLogsManagementAndComputeDisabled(t *testing.T) {
-	// Run the test in parallel for efficiency
-	t.Parallel()
-
-	// Set up the test suite and initialize the environment
-	setupTestSuite(t)
-
-	// Log the initiation of the cluster creation process
-	testLogger.Info(t, fmt.Sprintf("Cluster creation process initiated for test: %s", t.Name()))
-
-	// Generate a unique cluster prefix for the test
-	clusterPrefix := utils.GenerateRandomString()
-
-	// Retrieve environment variables required for configuration
-	envVars := GetEnvVars()
-
-	// Configure test options with Terraform variables and environment settings
-	options, err := setupOptions(t, clusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates)
-	require.NoErrorf(t, err, "Failed to set up test options: %v", err)
-
-	// Ensure options is initialized before teardown
-	if options == nil {
-		t.Fatalf("Test options initialization failed, cannot proceed.")
-	}
-
-	// Defer teardown to ensure cleanup, with a nil check for safety
-	defer func() {
-		if options != nil {
-			options.TestTearDown()
-		}
-	}()
-
-	// Configure observability settings:
-	options.TerraformVars["observability_logs_enable_for_management"] = false // Management logs disabled
-	options.TerraformVars["observability_logs_enable_for_compute"] = false    // Compute logs disabled
-	options.TerraformVars["observability_monitoring_enable"] = false          // Monitoring disabled
-	options.TerraformVars["observability_enable_platform_logs"] = false       // Platform logs disabled
-
-	// Skip automatic test teardown to allow for manual inspection after the test
-	options.SkipTestTearDown = true
-
-	// Validate the basic cluster configuration with the specified observability settings
-	lsf.ValidateBasicClusterConfigurationWithCloudLogs(t, options, testLogger)
-}
-
-// TestRunObservabilityMonitoringForManagementAndComputeEnabled validates the creation of a cluster
-// with observability monitoring enabled for both management and compute nodes. The test ensures that the
-// cluster setup passes basic validation checks, confirming that monitoring is properly configured and
-// functional on both node types, while cloud logs and platform logs remain disabled.
-func TestRunObservabilityMonitoringForManagementAndComputeEnabled(t *testing.T) {
-	// Run the test in parallel with other tests to optimize test execution
-	t.Parallel()
-
-	// Set up the test suite and environment configuration
-	setupTestSuite(t)
-
-	// Log the initiation of the cluster creation process
-	testLogger.Info(t, "Cluster creation process initiated for "+t.Name())
-
-	// Generate a random prefix for the cluster to ensure uniqueness
-	hpcClusterPrefix := utils.GenerateRandomString()
-
-	// Retrieve necessary environment variables to configure the test
-	envVars := GetEnvVars()
-
-	// Set up test options with relevant parameters, including resource group and environment variables
-	options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates)
-	require.NoError(t, err, "Error setting up test options: %v", err)
-
-	// Configure the observability settings: disable management cloud logs, enable monitoring on management and compute nodes
-	options.TerraformVars["observability_logs_enable_for_management"] = false
-	options.TerraformVars["observability_monitoring_enable"] = true
-	options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = true
-	options.TerraformVars["observability_monitoring_plan"] = "graduated-tier"
-
-	// Prevent automatic test teardown for inspection after the test runs
-	options.SkipTestTearDown = true
-
-	// Ensure test teardown is executed at the end of the test
-	defer options.TestTearDown()
-
-	// Validate that the basic cluster configuration is correct with cloud monitoring enabled for management and compute nodes
-	lsf.ValidateBasicClusterConfigurationWithCloudMonitoring(t, options, testLogger)
-}
-
-// TestRunObservabilityMonitoringForManagementEnabledAndComputeDisabled validates the creation of a cluster
-// with observability monitoring enabled for management nodes and disabled for compute nodes. The test
-// ensures that the cluster setup passes basic validation checks, confirming that monitoring is enabled on
-// management nodes and disabled on compute nodes, while cloud logs and platform logs remain disabled.
-func TestRunObservabilityMonitoringForManagementEnabledAndComputeDisabled(t *testing.T) {
-	// Run the test in parallel with other tests to optimize test execution
-	t.Parallel()
-
-	// Set up the test suite and environment configuration
-	setupTestSuite(t)
-
-	// Log the initiation of the cluster creation process
-	testLogger.Info(t, "Cluster creation process initiated for "+t.Name())
-
-	// Generate a random prefix for the cluster to ensure uniqueness
-	hpcClusterPrefix := utils.GenerateRandomString()
-
-	// Retrieve necessary environment variables to configure the test
-	envVars := GetEnvVars()
-
-	// Set up test options with relevant parameters, including resource group and environment variables
-	options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates)
-	require.NoError(t, err, "Error setting up test options: %v", err)
-
-	// Configure the observability settings: disable management cloud logs, enable monitoring on management nodes only
-	options.TerraformVars["observability_logs_enable_for_management"] = false
-	options.TerraformVars["observability_monitoring_enable"] = true
-	options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = false
-	options.TerraformVars["observability_monitoring_plan"] = "graduated-tier"
-
-	// Prevent automatic test teardown for inspection after the test runs
-	options.SkipTestTearDown = true
-
-	// Ensure test teardown is executed at the end of the test
-	defer options.TestTearDown()
-
-	// Validate that the basic cluster configuration is correct with cloud monitoring enabled for management nodes and disabled for compute nodes
-	lsf.ValidateBasicClusterConfigurationWithCloudMonitoring(t, options, testLogger)
-}
-
-// TestRunObservabilityMonitoringForManagementAndComputeDisabled validates cluster creation with
-// cloud monitoring turned off for the compute nodes. The test ensures that the cluster setup passes
-// basic validation checks, while cloud logs and platform logs remain disabled.
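The three monitoring permutations in this block differ only in two boolean Terraform variables. If the suite were ever consolidated, the same matrix could be written table-driven; a hedged sketch of just the variable assembly, using the values the function names imply (the case list and vars map below are illustrative, not suite code):

```go
package main

import "fmt"

// permutation captures the two toggles that distinguish the monitoring tests.
type permutation struct {
	name             string
	monitoringEnable bool
	monitorOnCompute bool
}

func main() {
	cases := []permutation{
		{"ManagementAndComputeEnabled", true, true},
		{"ManagementEnabledAndComputeDisabled", true, false},
		{"ManagementAndComputeDisabled", false, false},
	}

	for _, c := range cases {
		// In a real table-driven test each case would feed t.Run and setupOptions;
		// here we only assemble the Terraform variable overrides.
		vars := map[string]interface{}{
			"observability_logs_enable_for_management":         false,
			"observability_monitoring_enable":                  c.monitoringEnable,
			"observability_monitoring_on_compute_nodes_enable": c.monitorOnCompute,
			"observability_monitoring_plan":                    "graduated-tier",
		}
		fmt.Println(c.name, vars)
	}
}
```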
-func TestRunObservabilityMonitoringForManagementAndComputeDisabled(t *testing.T) {
-	// Run the test in parallel with other tests to optimize test execution
-	t.Parallel()
-
-	// Set up the test suite and environment configuration
-	setupTestSuite(t)
-
-	// Log the initiation of the cluster creation process
-	testLogger.Info(t, "Cluster creation process initiated for "+t.Name())
-
-	// Generate a random prefix for the cluster to ensure uniqueness
-	hpcClusterPrefix := utils.GenerateRandomString()
-
-	// Retrieve necessary environment variables to configure the test
-	envVars := GetEnvVars()
-
-	// Set up test options with relevant parameters, including resource group and environment variables
-	options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates)
-	require.NoError(t, err, "Error setting up test options: %v", err)
-
-	// Configure the observability monitoring settings
-	options.TerraformVars["observability_logs_enable_for_management"] = false
-	options.TerraformVars["observability_monitoring_enable"] = true
-	options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = false
-	options.TerraformVars["observability_monitoring_plan"] = "graduated-tier"
-
-	// Prevent automatic test teardown for inspection after the test runs
-	options.SkipTestTearDown = true
-
-	// Ensure test teardown is executed at the end of the test
-	defer options.TestTearDown()
-
-	// Validate that the basic cluster configuration is correct with cloud monitoring disabled for management and compute nodes
-	lsf.ValidateBasicClusterConfigurationWithCloudMonitoring(t, options, testLogger)
-}
-
-// TestRunobservabilityAtrackerEnabledAndTargetTypeAsCloudlogs validates cluster creation
-// with Observability Atracker enabled and the target type set to Cloud Logs.
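The Atracker tests that follow vary only observability_atracker_enable and observability_atracker_target_type between "cloudlogs" and "cos". A small illustrative guard for the target-type value, separate from the suite; the authoritative validation lives in the Terraform variable definitions:

```go
package main

import "fmt"

// validAtrackerTargetTypes mirrors the two target types exercised by these tests.
var validAtrackerTargetTypes = map[string]bool{"cloudlogs": true, "cos": true}

// checkAtrackerTargetType is an illustrative pre-flight guard, not repository code.
func checkAtrackerTargetType(targetType string) error {
	if !validAtrackerTargetTypes[targetType] {
		return fmt.Errorf("unsupported observability_atracker_target_type %q", targetType)
	}
	return nil
}

func main() {
	fmt.Println(checkAtrackerTargetType("cloudlogs")) // <nil>
	fmt.Println(checkAtrackerTargetType("cos"))       // <nil>
	fmt.Println(checkAtrackerTargetType("syslog"))    // unsupported observability_atracker_target_type "syslog"
}
```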
-func TestRunobservabilityAtrackerEnabledAndTargetTypeAsCloudlogs(t *testing.T) {
-	// Execute the test in parallel to improve efficiency
-	t.Parallel()
-
-	// Initialize the test suite and set up the environment
-	setupTestSuite(t)
-
-	// Log the initiation of the cluster creation process
-	testLogger.Info(t, "Cluster creation process initiated for "+t.Name())
-
-	// Generate a random prefix for the cluster to ensure uniqueness
-	hpcClusterPrefix := utils.GenerateRandomString()
-
-	// Retrieve environment variables required for configuration
-	envVars := GetEnvVars()
-
-	// Configure test options, including resource group and environment variables
-	options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates)
-	require.NoError(t, err, "Error setting up test options: %v", err)
-
-	// Set observability configurations for logs and monitoring
-	options.TerraformVars["observability_logs_enable_for_management"] = false
-	options.TerraformVars["observability_monitoring_enable"] = false
-	options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = false
-	options.TerraformVars["observability_atracker_enable"] = true
-	options.TerraformVars["observability_atracker_target_type"] = "cloudlogs"
-
-	// Prevent test teardown for post-test inspection
-	options.SkipTestTearDown = true
-
-	// Ensure proper cleanup after test execution
-	defer options.TestTearDown()
-
-	// Validate the cluster setup with Atracker enabled and target type as cloudlogs
-	lsf.ValidateBasicClusterConfigurationWithCloudAtracker(t, options, testLogger)
-}
-
-// TestRunobservabilityAtrackerEnabledAndTargetTypeAsCos validates cluster creation
-// with Observability Atracker enabled and the target type set to COS.
-func TestRunobservabilityAtrackerEnabledAndTargetTypeAsCos(t *testing.T) { - // Execute the test in parallel to improve efficiency - t.Parallel() - - // Initialize the test suite and set up the environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve environment variables required for configuration - envVars := GetEnvVars() - - // Configure test options, including resource group and environment variables - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Set observability configurations for logs and monitoring - options.TerraformVars["observability_logs_enable_for_management"] = false - options.TerraformVars["observability_monitoring_enable"] = false - options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = false - options.TerraformVars["observability_atracker_enable"] = true - options.TerraformVars["observability_atracker_target_type"] = "cos" - - // Prevent test teardown for post-test inspection - options.SkipTestTearDown = true - - // Ensure proper cleanup after test execution - defer options.TestTearDown() - - // Validate the cluster setup with Atracker enabled and target type as cos - lsf.ValidateBasicClusterConfigurationWithCloudAtracker(t, options, testLogger) -} - -// TestRunobservabilityAtrackerDisabledAndTargetTypeAsCos validates cluster creation -// with Observability Atracker disabled and the target type set to COS. -func TestRunobservabilityAtrackerDisabledAndTargetTypeAsCos(t *testing.T) { - // Execute the test in parallel to improve efficiency - t.Parallel() - - // Initialize the test suite and set up the environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve environment variables required for configuration - envVars := GetEnvVars() - - // Configure test options, including resource group and environment variables - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Set observability configurations for logs and monitoring - options.TerraformVars["observability_logs_enable_for_management"] = false - options.TerraformVars["observability_monitoring_enable"] = false - options.TerraformVars["observability_monitoring_on_compute_nodes_enable"] = false - options.TerraformVars["observability_atracker_enable"] = false - options.TerraformVars["observability_atracker_target_type"] = "cos" - - // Prevent test teardown for post-test inspection - options.SkipTestTearDown = true - - // Ensure proper cleanup after test execution - defer options.TestTearDown() - - // Validate the cluster setup with Atracker disabled and target type as cos - lsf.ValidateBasicClusterConfigurationWithCloudAtracker(t, options, testLogger) -} - -// ############################## Negative Test cases ########################################## - -// TestRunHPCWithoutMandatory tests Terraform's behavior when 
mandatory variables are missing by checking for specific error messages. -func TestRunHPCWithoutMandatory(t *testing.T) { - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Getting absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: map[string]interface{}{ - "solution": "hpc", - }, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptions) - - // If there is an error, check if it contains specific mandatory fields - if err != nil { - result := - utils.VerifyDataContains(t, err.Error(), "bastion_ssh_keys", testLogger) && - utils.VerifyDataContains(t, err.Error(), "compute_ssh_keys", testLogger) && - utils.VerifyDataContains(t, err.Error(), "remote_allowed_ips", testLogger) - // Assert that the result is true if all mandatory fields are missing - assert.True(t, result) - } else { - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on HPC without mandatory") - } - -} - -// TestRunLSFWithoutMandatory tests Terraform's behavior when mandatory variables are missing by checking for specific error messages. -func TestRunLSFWithoutMandatory(t *testing.T) { - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Getting absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: map[string]interface{}{ - "solution": "lsf", - }, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptions) - - // If there is an error, check if it contains specific mandatory fields - if err != nil { - result := utils.VerifyDataContains(t, err.Error(), "bastion_ssh_keys", testLogger) && - utils.VerifyDataContains(t, err.Error(), "compute_ssh_keys", testLogger) && - utils.VerifyDataContains(t, err.Error(), "remote_allowed_ips", testLogger) - // Assert that the result is true if all mandatory fields are missing - assert.True(t, result) - } else { - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on LSF without mandatory") - } - -} - -// TestRunHPCInvalidReservationID verifies Terraform's behavior when mandatory variables are missing. -// Specifically, it checks for appropriate error messages when "reservation_id" is not set correctly. 
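The negative tests in this part of the suite never apply anything; they run terraform plan and assert that the error text mentions the offending inputs via utils.VerifyDataContains. A rough, self-contained sketch of that style of check; the helper below is a stand-in rather than the suite's implementation, and the sample error text is condensed rather than Terraform's exact wording:

```go
package main

import (
	"fmt"
	"strings"
)

// containsAll is a stand-in for the suite's substring verification helper:
// it reports whether every expected fragment appears in the plan error text.
func containsAll(errText string, fragments ...string) bool {
	for _, f := range fragments {
		if !strings.Contains(errText, f) {
			return false
		}
	}
	return true
}

func main() {
	// A condensed example of the kind of message a failed plan might produce;
	// the exact wording comes from Terraform and the module's validation rules.
	planErr := `Error: No value for required variable "bastion_ssh_keys"
Error: No value for required variable "compute_ssh_keys"
Error: No value for required variable "remote_allowed_ips"`

	fmt.Println(containsAll(planErr, "bastion_ssh_keys", "compute_ssh_keys", "remote_allowed_ips")) // true
}
```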
-func TestRunHPCInvalidReservationID(t *testing.T) { - // Parallelize the test for concurrent execution - t.Parallel() - - // Set up the test suite environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Retrieve required environment variables - envVars := GetEnvVars() - - // Determine the absolute path to the Terraform directory - absPath, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get the absolute path for the solutions directory") - - // Adjust the Terraform directory path to remove "tests/" if present - terraformDir := strings.ReplaceAll(absPath, "tests/", "") - - // Define Terraform options with relevant variables - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terraformDir, - Vars: map[string]interface{}{ - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "solution": "hpc", - "cluster_name": envVars.ClusterName, - }, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptions) - - // Ensure an error is returned during the planning stage - assert.Error(t, err, "Expected an error during plan") - - // Validate the error message if an error occurred - if err != nil { - // Verify the error message contains expected substrings - isErrorValid := utils.VerifyDataContains(t, err.Error(), "validate_reservation_id_new_msg", testLogger) && - utils.VerifyDataContains(t, err.Error(), "The provided reservation id doesn't have a valid reservation or the reservation id is not on the same account as HPC deployment.", testLogger) - - // Assert that all required validations passed - assert.True(t, isErrorValid, "Error validation failed") - } else { - // Log failure if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur for reservation ID") - } -} - -// TestRunInvalidSubnetCIDR validates cluster creation with invalid subnet CIDR ranges. 
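The invalid values exercised in the next test ("1.1.1.1/20" and "2.2.2.2/20") parse as CIDR strings but have host bits set, which is one reason the VPC API rejects them. A quick standard-library illustration, separate from the test itself, which relies on the real apply error instead:

```go
package main

import (
	"fmt"
	"net"
)

// hasHostBits reports whether the CIDR's address differs from its network address,
// i.e. host bits are set ("1.1.1.1/20" instead of "1.1.0.0/20").
func hasHostBits(cidr string) (bool, error) {
	ip, ipNet, err := net.ParseCIDR(cidr)
	if err != nil {
		return false, err
	}
	return !ip.Equal(ipNet.IP), nil
}

func main() {
	for _, cidr := range []string{"1.1.1.1/20", "2.2.2.2/20", "10.241.48.0/21"} {
		bad, err := hasHostBits(cidr)
		fmt.Printf("%-16s host bits set: %v err: %v\n", cidr, bad, err)
	}
	// The two invalid test values report true; 10.241.48.0/21, a subnet CIDR the
	// suite actually uses elsewhere, reports false.
}
```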
-func TestRunInvalidSubnetCIDR(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Get the absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "solution": envVars.Solution, - "cluster_name": envVars.ClusterName, - "vpc_cluster_private_subnets_cidr_blocks": utils.SplitAndTrim("1.1.1.1/20", ","), - "vpc_cluster_login_private_subnets_cidr_blocks": utils.SplitAndTrim("2.2.2.2/20", ","), - "scc_enable": false, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars["reservation_id"] = envVars.ReservationID - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }, - ) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Apply the Terraform configuration - _, err = terraform.InitAndApplyE(t, terraformOptions) - - // Check if an error occurred during apply - assert.Error(t, err, "Expected an error during apply") - - if err != nil { - // Check if the error message contains specific keywords indicating Subnet CIDR block issues - result := utils.VerifyDataContains(t, err.Error(), "Invalid json payload provided: Key: 'SubnetTemplateOneOf.SubnetTemplate.CIDRBlock' Error:Field validation for 'CIDRBlock' failed on the 'validcidr' tag", testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Invalid Subnet CIDR range") - } else { - testLogger.FAIL(t, "Validation failed: Invalid Subnet CIDR range") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on Invalid Subnet CIDR range") - } - - // Cleanup resources - defer terraform.Destroy(t, terraformOptions) -} - -// TestRunInvalidSshKeysAndRemoteAllowedIP validates cluster creation with invalid ssh keys and remote allowed IP. 
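Elsewhere in the suite, comma-separated environment values become Terraform list variables through utils.SplitAndTrim; the next test instead passes literal empty strings to force validation failures. A sketch of what such a helper plausibly does (assumed behavior, not the repository's implementation):

```go
package main

import (
	"fmt"
	"strings"
)

// splitAndTrim splits a comma-separated value, trims whitespace, and drops
// empty entries. Assumed to approximate the suite's utils.SplitAndTrim.
func splitAndTrim(s, sep string) []string {
	var out []string
	for _, part := range strings.Split(s, sep) {
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			out = append(out, trimmed)
		}
	}
	return out
}

func main() {
	fmt.Printf("%q\n", splitAndTrim("key-one, key-two ,key-three", ",")) // ["key-one" "key-two" "key-three"]

	// An empty input yields an empty list, unlike the literal []string{""} that the
	// negative test below passes on purpose so the plan fails on empty SSH key and IP entries.
	fmt.Printf("%q\n", splitAndTrim("", ","))
}
```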
-func TestRunInvalidSshKeysAndRemoteAllowedIP(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Get the absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": []string{""}, - "compute_ssh_keys": []string{""}, - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": []string{""}, - "cluster_name": envVars.ClusterName, - "solution": envVars.Solution, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars["reservation_id"] = envVars.ReservationID - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptions) - - // Check if an error occurred during plan - assert.Error(t, err, "Expected an error during plan") - - if err != nil { - // Check if the error message contains specific keywords indicating domain name issues - result := utils.VerifyDataContains(t, err.Error(), "The provided IP address format is not valid", testLogger) && - utils.VerifyDataContains(t, err.Error(), "No SSH Key found with name", testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Invalid ssh keys and remote allowed IP") - } else { - testLogger.FAIL(t, "Validation failed: Invalid ssh keys and remote allowed IP") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on Invalid ssh keys and remote allowed IP") - } -} - -// TestRunHPCInvalidReservationIDAndContractID tests invalid cluster_name and reservation_id values -func TestRunHPCInvalidReservationIDAndContractID(t *testing.T) { - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Define invalid cluster_name and reservation_id values - invalidClusterNames := []string{ - "too_long_cluster_name_1234567890_abcdefghijklmnopqrstuvwxyz", //pragma: allowlist secret - "invalid@cluster!id#", - "", - } - - invalidReservationIDs := []string{ - "1invalid_reservation", - "invalid_reservation@id", - "ContractIBM", - "", - } - - // Getting absolute path of solutions/hpc - abs, err := 
filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Loop over all combinations of invalid cluster_name and reservation_id values - for _, ClusterName := range invalidClusterNames { - for _, reservationID := range invalidReservationIDs { - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": ClusterName, - "reservation_id": reservationID, - "solution": "hpc", - }, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptions) - - // If there is an error, check if it contains specific mandatory fields - if err != nil { - ClusterNameError := utils.VerifyDataContains(t, err.Error(), "cluster_name", testLogger) - reservationIDError := utils.VerifyDataContains(t, err.Error(), "reservation_id", testLogger) - result := ClusterNameError && reservationIDError - // Assert that the result is true if all mandatory fields are missing - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Invalid ClusterName and ReservationID") - } else { - testLogger.FAIL(t, "Validation failed: Expected error did not contain required fields: cluster_name or reservation_id") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on Invalid ClusterName and ReservationID validation") - } - } - } -} - -// TestRunInvalidLDAPServerIP validates cluster creation with invalid LDAP server IP. -func TestRunInvalidLDAPServerIP(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - if strings.ToLower(envVars.EnableLdap) == "true" { - // Check if the LDAP credentials are provided - if len(envVars.LdapAdminPassword) == 0 || len(envVars.LdapUserName) == 0 || len(envVars.LdapUserPassword) == 0 { - require.FailNow(t, "LDAP credentials are missing. Make sure LDAP admin password, LDAP user name, and LDAP user password are provided.") - } - } else { - require.FailNow(t, "LDAP is not enabled. 
Set the 'enable_ldap' environment variable to 'true' to enable LDAP.") - } - - // Get the absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "enable_ldap": true, - "ldap_admin_password": envVars.LdapAdminPassword, //pragma: allowlist secret - "ldap_server": "10.10.10.10", - "ldap_server_cert": "SampleTest", - "solution": envVars.Solution, - "scc_enable": false, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars["reservation_id"] = envVars.ReservationID - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Apply the Terraform configuration - output, err := terraform.InitAndApplyE(t, terraformOptions) - - // Check if an error occurred during apply - assert.Error(t, err, "Expected an error during apply") - - if err != nil { - - // Check if the error message contains specific keywords indicating LDAP server IP issues - result := utils.VerifyDataContains(t, output, "The connection to the existing LDAP server 10.10.10.10 failed", testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Invalid LDAP server IP") - } else { - testLogger.FAIL(t, "Validation failed: Invalid LDAP server IP") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on Invalid LDAP Server IP") - } - - // Cleanup resources - defer terraform.Destroy(t, terraformOptions) -} - -// TestRunInvalidLDAPServerCert validates cluster creation with invalid LDAP server Cert. -func TestRunInvalidLDAPServerCert(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - if strings.ToLower(envVars.EnableLdap) == "true" { - // Check if the LDAP credentials are provided - if len(envVars.LdapAdminPassword) == 0 || len(envVars.LdapUserName) == 0 || len(envVars.LdapUserPassword) == 0 { - require.FailNow(t, "LDAP credentials are missing. Make sure LDAP admin password, LDAP user name, and LDAP user password are provided.") - } - } else { - require.FailNow(t, "LDAP is not enabled. 
Set the 'enable_ldap' environment variable to 'true' to enable LDAP.") - } - - // Get the absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "enable_ldap": true, - "ldap_admin_password": envVars.LdapAdminPassword, //pragma: allowlist secret - "ldap_server": "10.10.10.10", - "ldap_server_cert": "", - "solution": envVars.Solution, - "scc_enable": false, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars["reservation_id"] = envVars.ReservationID - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // plan the Terraform configuration - _, err = terraform.InitAndPlanE(t, terraformOptions) - - // Check if an error occurred during plan - assert.Error(t, err, "Expected an error during plan") - - if err != nil { - - // Check if the error message contains specific keywords indicating LDAP server IP issues - result := utils.VerifyDataContains(t, err.Error(), "Provide the current LDAP server certificate. This is required if 'ldap_server' is not set to 'null'; otherwise, the LDAP configuration will not succeed.", testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Invalid LDAP server Cert") - } else { - testLogger.FAIL(t, "Validation failed: Invalid LDAP server Cert") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on Invalid LDAP Server Cert") - } - - // Cleanup resources - defer terraform.Destroy(t, terraformOptions) -} - -// TestRunInvalidLDAPUsernamePassword tests invalid LDAP username and password -func TestRunInvalidLDAPUsernamePassword(t *testing.T) { - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Define invalid ldap username and password values - invalidLDAPUsername := []string{ - "usr", - "user@1234567890123456789012345678901", - "", - "user 1234", - } - - invalidLDAPPassword := []string{ - "password", - "PasswoRD123", - "password123", - "Password@", - "Password123", - "password@12345678901234567890", - } - - // Getting absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Loop over all combinations of invalid ldap username and password values - for _, 
username := range invalidLDAPUsername { - for _, password := range invalidLDAPPassword { //pragma: allowlist secret - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "enable_ldap": true, - "ldap_user_name": username, - "ldap_user_password": password, //pragma: allowlist secret - "ldap_admin_password": password, //pragma: allowlist secret - "solution": envVars.Solution, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars["reservation_id"] = envVars.ReservationID - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptions) - - // If there is an error, check if it contains specific mandatory fields - if err != nil { - usernameError := utils.VerifyDataContains(t, err.Error(), "ldap_user_name", testLogger) - userPasswordError := utils.VerifyDataContains(t, err.Error(), "ldap_usr_pwd", testLogger) - adminPasswordError := utils.VerifyDataContains(t, err.Error(), "ldap_adm_pwd", testLogger) - result := usernameError && userPasswordError && adminPasswordError - - // Assert that the result is true if all mandatory fields are missing - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Invalid LDAP username LDAP user password ,LDAP admin password") - } else { - testLogger.FAIL(t, "Validation failed: Expected error did not contain required fields: ldap_user_name, ldap_user_password or ldap_admin_password") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not contain required fields: ldap_user_name, ldap_user_password or ldap_admin_password") - } - } - } -} - -// TestRunInvalidAPPCenterPassword tests invalid values for app center password -func TestRunInvalidAPPCenterPassword(t *testing.T) { - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - invalidAPPCenterPwd := []string{ - "pass@1234", - "Pass1234", - "Pas@12", - "", - } - - // Loop over all combinations of invalid cluster_name and reservation_id values - for _, password := range invalidAPPCenterPwd { //pragma: allowlist secret - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Getting absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": 
utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "enable_app_center": true, - "app_center_gui_pwd": password, - "solution": envVars.Solution, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars["reservation_id"] = envVars.ReservationID - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptions) - - // If there is an error, check if it contains specific mandatory fields - if err != nil { - result := utils.VerifyDataContains(t, err.Error(), "app_center_gui_pwd", testLogger) - - // Assert that the result is true if all mandatory fields are missing - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Invalid Application Center Password") - } else { - testLogger.FAIL(t, "Validation failed: Invalid Application Center Password") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on Invalid Application Center Password") - } - } -} - -// TestRunInvalidDomainName validates cluster creation with invalid domain name. -func TestRunInvalidDomainName(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Get the absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "dns_domain_name": map[string]string{"compute": "sample"}, - "solution": envVars.Solution, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars["reservation_id"] = envVars.ReservationID - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptions) - - // Check if an error occurred during plan - assert.Error(t, err, "Expected an error during plan") - - if 
err != nil { - // Check if the error message contains specific keywords indicating domain name issues - result := utils.VerifyDataContains(t, err.Error(), "The domain name provided for compute is not a fully qualified domain name", testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Invalid domain name") - } else { - testLogger.FAIL(t, "Validation failed: Invalid domain name") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on Invalid domain name") - } -} - -// TestRunKMSInstanceNameAndKMSKeyNameWithInvalidValue tests the creation of KMS instances and KMS key names with invalid values -func TestRunKMSInstanceNameAndKMSKeyNameWithInvalidValue(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Service instance name - randomString := utils.GenerateRandomString() - kmsInstanceName := "cicd-" + randomString - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Create service instance and KMS key using IBMCloud CLI - err := lsf.CreateServiceInstanceAndKmsKey(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultExistingResourceGroup, kmsInstanceName, lsf.KMS_KEY_NAME, testLogger) - require.NoError(t, err, "Failed to create service instance and KMS key") - - // Ensure the service instance and KMS key are deleted after the test - defer lsf.DeleteServiceInstanceAndAssociatedKeys(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultExistingResourceGroup, kmsInstanceName, testLogger) - - testLogger.Info(t, "Service instance and KMS key created successfully: "+t.Name()) - - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Failed to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - const ( - invalidKMSKeyName = "sample-key" - invalidKMSInstanceName = "sample-ins" - noKeyErrorMsg = "No keys with name sample-key" - noInstanceErrorMsg = "No resource instance found with name [sample-ins]" - noInstanceIDErrorMsg = "Please make sure you are passing the kms_instance_name if you are passing kms_key_name" - ) - - // Initialize the map to hold the variables - vars1 := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "kms_instance_name": kmsInstanceName, - "kms_key_name": invalidKMSKeyName, - "solution": envVars.Solution, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars1["reservation_id"] = envVars.ReservationID - } - - // Test with valid instance ID and invalid key name - terraformOptionsCase1 := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars1, - }) - - 
// Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptionsCase1) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptionsCase1) - - if err != nil { - result := utils.VerifyDataContains(t, err.Error(), noKeyErrorMsg, testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Valid instance ID and invalid key name") - } else { - testLogger.FAIL(t, "Validation failed: Valid instance ID and invalid key name") - } - } else { - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur with valid instance ID and invalid key name") - } - - // Initialize the map to hold the variables - vars2 := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "kms_instance_name": invalidKMSInstanceName, - "kms_key_name": lsf.KMS_KEY_NAME, - "solution": envVars.Solution, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars2["reservation_id"] = envVars.ReservationID - } - - // Test with invalid instance ID and valid key name - terraformOptionsCase2 := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars2, - }) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptionsCase2) - if err != nil { - result := utils.VerifyDataContains(t, err.Error(), noInstanceErrorMsg, testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Invalid instance ID and valid key name") - } else { - testLogger.FAIL(t, "Validation failed: Invalid instance ID and valid key name") - } - } else { - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur with invalid instance ID and valid key name") - } - - // Initialize the map to hold the variables - vars3 := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "kms_key_name": lsf.KMS_KEY_NAME, - "solution": envVars.Solution, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars3["reservation_id"] = envVars.ReservationID - } - - // Test without instance ID and valid key name - terraformOptionsCase3 := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars3, - }) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptionsCase3) - if err != nil { - result := utils.VerifyDataContains(t, err.Error(), noInstanceIDErrorMsg, testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Without instance ID and valid key name") - } else { - testLogger.FAIL(t, "Validation failed: Without instance ID and valid key name") - } - } else { - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur without instance ID and valid key name") - } -} - -// Verify 
that existing subnet_id has an input value, then there should be an entry for 'vpc_name' -func TestRunExistSubnetIDVpcNameAsNull(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group, set up test environment - options, err := setupOptionsVpc(t, hpcClusterPrefix, createVpcTerraformDir, envVars.DefaultExistingResourceGroup) - require.NoError(t, err, "Error setting up test options: %v", err) - - // Skip test teardown for further inspection - options.SkipTestTearDown = true - defer options.TestTearDown() - - // Run the test - output, err := options.RunTest() - require.NoError(t, err, "Error running consistency test: %v", err) - require.NotNil(t, output, "Expected non-nil output, but got nil") - outputs := (options.LastTestTerraformOutputs) - - bastionsubnetId, computesubnetIds := utils.GetSubnetIds(outputs) - - // Get the absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "cluster_subnet_ids": utils.SplitAndTrim(computesubnetIds, ","), - "login_subnet_id": bastionsubnetId, - "solution": envVars.Solution, - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars["reservation_id"] = envVars.ReservationID - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Plan the Terraform deployment - _, err = terraform.PlanE(t, terraformOptions) - - // Check if an error occurred during plan - assert.Error(t, err, "Expected an error during plan") - - if err != nil { - // Check if the error message contains specific keywords indicating vpc name issues - result := utils.VerifyDataContains(t, err.Error(), "If the cluster_subnet_ids are provided, the user should also provide the vpc_name", testLogger) && - utils.VerifyDataContains(t, err.Error(), "Provided cluster subnets should be in appropriate zone", testLogger) && - utils.VerifyDataContains(t, err.Error(), "Provided login subnet should be within the vpc entered", testLogger) && - utils.VerifyDataContains(t, err.Error(), "Provided login subnet should be in appropriate zone", testLogger) && - utils.VerifyDataContains(t, err.Error(), "Provided cluster subnets should be within the vpc entered", testLogger) && - utils.VerifyDataContains(t, 
err.Error(), "Provided existing cluster_subnet_ids should have public gateway attached", testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded: Without VPC name and with valid cluster_subnet_ids and login_subnet_id") - } else { - testLogger.FAIL(t, "Validation failed: Without VPC name and with valid cluster_subnet_ids and login_subnet_id") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected error did not occur on Without VPC name and with valid cluster_subnet_ids and login_subnet_id") - } -} - -// TestRunInvalidDedicatedHostConfigurationWithZeroWorkerNodes validates the behavior of cluster creation -// when a dedicated host is enabled but the worker node count is set to zero. -func TestRunInvalidDedicatedHostConfigurationWithZeroWorkerNodes(t *testing.T) { - // Parallelize the test - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateRandomString() - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Set up the test options with the relevant parameters, including environment variables and resource group - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - require.NoError(t, err, "Error setting up test options: %v", err) - options.TerraformVars["enable_dedicated_host"] = true - options.TerraformVars["worker_node_instance_type"] = []map[string]interface{}{ - { - "count": 0, - "instance_type": "bx2-2x8", - }, - } - - options.SkipTestTearDown = true - defer options.TestTearDown() - - lsf.ValidateBasicClusterConfigurationWithDedicatedHost(t, options, false, testLogger) - -} - -// TestRunInvalidDedicatedHostProfile validates cluster creation with an invalid instance profile. 
-func TestRunInvalidDedicatedHostProfile(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // Get the absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "solution": envVars.Solution, - "scc_enable": false, - "enable_dedicated_host": true, - "worker_node_instance_type": []map[string]interface{}{ // Invalid data - { - "count": 1, - "instance_type": "cx2-2x4", - }, - { - "count": 1, - "instance_type": "bx2-2x8", - }, - }, - "observability_monitoring_enable": false, - "enable_cos_integration": false, - "enable_vpc_flow_logs": false, - "key_management": "null", - } - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "HPC" { - // specific to HPC - vars["reservation_id"] = envVars.ReservationID - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Apply the Terraform configuration - _, err = terraform.InitAndPlanE(t, terraformOptions) - - // Check if an error occurred during apply - assert.Error(t, err, "Expected an error during plan") - - if err != nil { - errMsg := err.Error() - // Check if the error message contains specific keywords - containsWorkerNodeType := utils.VerifyDataContains(t, errMsg, "is list of object with 2 elements", testLogger) - containsDedicatedHost := utils.VerifyDataContains(t, errMsg, "'enable_dedicated_host' is true, only one profile should be specified", testLogger) - - result := containsWorkerNodeType && containsDedicatedHost - assert.True(t, result) - - if result { - testLogger.PASS(t, "Validation succeeded for invalid worker_node_instance_type object elements.") - } else { - testLogger.FAIL(t, fmt.Sprintf("Validation failed: expected error conditions not met. Actual error: %s", errMsg)) - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected error did not occur") - testLogger.FAIL(t, "Expected validation error did not occur for Invalid Dedicated-Host instance profile.") - - } - -} - -// TestRunInvalidMinWorkerNodeCountGreaterThanMax cluster creation with an invalid worker node count. 
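Several of the removed tests iterate over every combination of two slices of invalid values (cluster names crossed with reservation IDs, LDAP usernames crossed with passwords) and re-plan for each pair. The same idea can be expressed with named subtests; this stdlib-only sketch uses a stand-in validate function in place of the plan-and-check step, and its rules are illustrative, not the module's real ones.

package validation_test

import (
	"fmt"
	"strings"
	"testing"
)

// validateInputs stands in for the plan-and-check step the removed tests
// perform with Terratest; it only mimics returning a validation error.
func validateInputs(clusterName, reservationID string) error {
	if clusterName == "" || strings.ContainsAny(clusterName, "@!#") {
		return fmt.Errorf("cluster_name %q is invalid", clusterName)
	}
	if reservationID == "" || !strings.HasPrefix(reservationID, "Contract") {
		return fmt.Errorf("reservation_id %q is invalid", reservationID)
	}
	return nil
}

func TestInvalidCombinations(t *testing.T) {
	invalidClusterNames := []string{"invalid@cluster!id#", ""}
	invalidReservationIDs := []string{"1invalid_reservation", ""}

	for _, name := range invalidClusterNames {
		for _, id := range invalidReservationIDs {
			name, id := name, id // capture loop variables for the subtest closure
			t.Run(fmt.Sprintf("name=%q/id=%q", name, id), func(t *testing.T) {
				if err := validateInputs(name, id); err == nil {
					t.Errorf("expected a validation error for %q / %q", name, id)
				}
			})
		}
	}
}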
-func TestRunInvalidMinWorkerNodeCountGreaterThanMax(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Set up the test suite and prepare the testing environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Generate a random prefix for the cluster to ensure uniqueness - hpcClusterPrefix := utils.GenerateTimestampedClusterPrefix(utils.GenerateRandomString()) - - // Retrieve necessary environment variables for the test - envVars := GetEnvVars() - - // You can add conditional logic here to modify the map, for example: - if envVars.Solution == "lsf" { - - // Get the absolute path of solutions/hpc - abs, err := filepath.Abs("solutions/hpc") - require.NoError(t, err, "Unable to get absolute path") - - terrPath := strings.ReplaceAll(abs, "tests/", "") - - // Initialize the map to hold the variables - vars := map[string]interface{}{ - "cluster_prefix": hpcClusterPrefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "worker_node_max_count": 2, //invalid - "worker_node_instance_type": []map[string]interface{}{ // Invalid data - { - "count": 2, - "instance_type": "bx2-2x8", - }, - { - "count": 1, - "instance_type": "cx2-2x4", - }, - }, - "solution": envVars.Solution, - "scc_enable": false, - "observability_monitoring_enable": false, - "enable_cos_integration": false, - "enable_vpc_flow_logs": false, - "key_management": "null", - } - - // Define Terraform options - terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ - TerraformDir: terrPath, - Vars: vars, - }) - - // Perform Terraform upgrade only once - UpgradeTerraformOnce(t, terraformOptions) - - // Apply the Terraform configuration - _, err = terraform.InitAndPlanE(t, terraformOptions) - - // Check if an error occurred during plan - assert.Error(t, err, "Expected an error during plan") - - if err != nil { - - // Check if the error message contains specific keywords indicating LDAP server IP issues - result := utils.VerifyDataContains(t, err.Error(), "If the solution is set as lsf, the worker min count cannot be greater than worker max count.", testLogger) - assert.True(t, result) - if result { - testLogger.PASS(t, "Validation succeeded for the worker node count") - } else { - testLogger.FAIL(t, "Validation failed for the worker node count") - } - } else { - // Log an error if the expected error did not occur - t.Error("Expected validation error did not occur.") - testLogger.FAIL(t, "Expected validation error did not occur for Invalid worker node count") - } - // Cleanup resources - defer terraform.Destroy(t, terraformOptions) - } - testLogger.Info(t, "TestRunInvalidMinWorkerNodeCountGreaterThanMax will execute If the solution is set as lsf") -} - -// ############################## Existing Environment Test Cases ############################### -// TestRunExistingPACEnvironment test the validation of an existing PAC environment configuration. 
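The two existing-environment tests that follow read a JSON description of an already-deployed cluster from the path in EXISTING_ENV_JSON_FILE_PATH, fail fast if it is missing, and parse it with utils.ParseConfig. A stdlib-only sketch of that guard-and-parse step; the field names here are illustrative and not the full schema the repo's ParseConfig expects.

package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
)

// existingEnv is an illustrative subset of the JSON layout; the repository's
// utils.ParseConfig defines the authoritative structure.
type existingEnv struct {
	BastionIP   string   `json:"bastion_ip"`
	ClusterName string   `json:"cluster_name"`
	Zones       []string `json:"zones"`
}

func loadExistingEnv() (*existingEnv, error) {
	path, ok := os.LookupEnv("EXISTING_ENV_JSON_FILE_PATH")
	if !ok {
		return nil, fmt.Errorf("EXISTING_ENV_JSON_FILE_PATH is not set")
	}
	if _, err := os.Stat(path); err != nil {
		return nil, fmt.Errorf("config file %q is not accessible: %w", path, err)
	}
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	var cfg existingEnv
	if err := json.Unmarshal(data, &cfg); err != nil {
		return nil, fmt.Errorf("parsing %q: %w", path, err)
	}
	return &cfg, nil
}

func main() {
	cfg, err := loadExistingEnv()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("validating existing cluster %s via bastion %s\n", cfg.ClusterName, cfg.BastionIP)
}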
-func TestRunExistingPACEnvironment(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Setup the test suite environment - setupTestSuite(t) - - // Log the initiation of cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Retrieve the environment variable for the JSON file path - val, ok := os.LookupEnv("EXISTING_ENV_JSON_FILE_PATH") - if !ok { - t.Fatal("Environment variable 'EXISTING_ENV_JSON_FILE_PATH' is not set") - } - - // Check if the JSON file exists - if _, err := os.Stat(val); os.IsNotExist(err) { - t.Fatalf("JSON file '%s' does not exist", val) - } - - // Parse the JSON configuration file - config, err := utils.ParseConfig(val) - require.NoError(t, err, "Error parsing JSON configuration: %v", err) - - // Validate the cluster configuration - lsf.ValidateClusterConfigWithAPPCenterOnExistingEnvironment( - t, config.ComputeSshKeysList, config.BastionIP, config.LoginNodeIP, config.ClusterName, config.ReservationID, - config.ClusterPrefixName, config.ResourceGroup, config.KeyManagement, - config.Zones, config.DnsDomainName, config.ManagementNodeIPList, - config.IsHyperthreadingEnabled, testLogger) -} - -// TestRunExistingPACAndLDAPEnvironment test the validation of an existing PAC and LDAP environment configuration. -func TestRunExistingPACAndLDAPEnvironment(t *testing.T) { - // Parallelize the test to run concurrently with others - t.Parallel() - - // Setup the test suite environment - setupTestSuite(t) - - // Log the initiation of the cluster creation process - testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) - - // Retrieve the environment variable for the JSON file path - val, ok := os.LookupEnv("EXISTING_ENV_JSON_FILE_PATH") - if !ok { - t.Fatal("Environment variable 'EXISTING_ENV_JSON_FILE_PATH' is not set") - } - - // Check if the JSON file exists - if _, err := os.Stat(val); os.IsNotExist(err) { - t.Fatalf("JSON file '%s' does not exist", val) - } - - // Parse the JSON configuration file - config, err := utils.ParseConfig(val) - require.NoError(t, err, "Error parsing JSON configuration: %v", err) - - // Validate the cluster configuration - lsf.ValidateClusterConfigWithAPPCenterAndLDAPOnExistingEnvironment( - t, config.ComputeSshKeysList, config.BastionIP, config.LoginNodeIP, config.ClusterName, config.ReservationID, - config.ClusterPrefixName, config.ResourceGroup, config.KeyManagement, config.Zones, config.DnsDomainName, - config.ManagementNodeIPList, config.IsHyperthreadingEnabled, config.LdapServerIP, config.LdapDomain, - config.LdapAdminPassword, config.LdapUserName, config.LdapUserPassword, testLogger) - -} diff --git a/tests/pr_test.go b/tests/pr_test.go index e657c03c..e1f1780e 100644 --- a/tests/pr_test.go +++ b/tests/pr_test.go @@ -1,463 +1,65 @@ package tests import ( - "fmt" "log" "os" "path/filepath" - "reflect" - "strings" - "sync" "testing" - "time" - "github.com/gruntwork-io/terratest/modules/terraform" "github.com/stretchr/testify/require" - "github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper/testhelper" - + deploy "github.com/terraform-ibm-modules/terraform-ibm-hpc/deployment" + lsf_tests "github.com/terraform-ibm-modules/terraform-ibm-hpc/lsf_tests" utils "github.com/terraform-ibm-modules/terraform-ibm-hpc/utilities" ) -// Constants for better organization -const ( - // Path of the Terraform directory - terraformDir = "solutions/hpc" -) - -var ignoreDestroys = []string{ - 
"module.landing_zone_vsi.module.hpc.module.check_cluster_status.null_resource.remote_exec[0]", - "module.landing_zone_vsi.module.hpc.module.check_node_status.null_resource.remote_exec[1]", - "module.landing_zone_vsi.module.hpc.module.check_node_status.null_resource.remote_exec[0]", - "module.landing_zone_vsi.module.hpc.module.check_node_status.null_resource.remote_exec[2]", - "module.check_node_status.null_resource.remote_exec[0]", - "module.landing_zone_vsi.module.wait_management_vsi_booted.null_resource.remote_exec[0]", - "module.check_node_status.null_resource.remote_exec[1]", - "module.landing_zone_vsi.module.wait_management_candidate_vsi_booted.null_resource.remote_exec[0]", - "module.check_cluster_status.null_resource.remote_exec[0]", - "module.landing_zone_vsi.module.hpc.module.landing_zone_vsi.module.wait_management_candidate_vsi_booted.null_resource.remote_exec[0]", - "module.landing_zone_vsi.module.hpc.module.landing_zone_vsi.module.wait_management_vsi_booted.null_resource.remote_exec[0]", - "module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_cp_files[1]", - "module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_new_file[0]", - "module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_cp_files[0]", - "module.landing_zone_vsi.module.do_management_candidate_vsi_configuration.null_resource.remote_exec_script_new_file[0]", - "module.landing_zone_vsi.module.do_management_candidate_vsi_configuration.null_resource.remote_exec_script_run[0]", - "module.landing_zone_vsi[0].module.wait_management_vsi_booted.null_resource.remote_exec[0]", - "module.landing_zone_vsi[0].module.lsf_entitlement[0].null_resource.remote_exec[0]", - "module.landing_zone_vsi[0].module.wait_management_candidate_vsi_booted.null_resource.remote_exec[1]", - "module.landing_zone_vsi[0].module.wait_management_candidate_vsi_booted.null_resource.remote_exec[0]", - "module.landing_zone_vsi[0].module.wait_worker_vsi_booted[0].null_resource.remote_exec[0]", - "module.check_node_status.null_resource.remote_exec[2]", - "module.landing_zone_vsi[0].module.wait_worker_vsi_booted[0].null_resource.remote_exec[1]", - "module.landing_zone_vsi[0].module.do_management_vsi_configuration.null_resource.remote_exec_script_run[0]", -} - -var ignoreUpdates = []string{ - "module.file_storage.ibm_is_share.share[0]", - "module.file_storage.ibm_is_share.share[1]", - "module.file_storage.ibm_is_share.share[2]", - "module.file_storage.ibm_is_share.share[3]", - "module.file_storage.ibm_is_share.share[4]", -} - -// EnvVars stores environment variable values. 
-type EnvVars struct { - DefaultExistingResourceGroup string - NonDefaultExistingResourceGroup string - Zone string - ClusterName string - ReservationID string - RemoteAllowedIPs string - SSHKey string - LoginNodeInstanceType string - LoginNodeImageName string - ManagementImageName string - ComputeImageName string - ManagementNodeInstanceType string - ManagementNodeCount string - KeyManagement string - KMSInstanceName string - KMSKeyName string - HyperthreadingEnabled string - DnsDomainName string - EnableAppCenter string - AppCenterGuiPassword string - EnableLdap string - LdapBaseDns string - LdapServer string - LdapAdminPassword string - LdapUserName string - LdapUserPassword string - USEastZone string - USEastReservationID string - USEastClusterName string - EUDEZone string - EUDEReservationID string - EUDEClusterName string - SSHFilePath string - USSouthZone string - USSouthReservationID string - USSouthClusterName string - JPTokZone string - JPTokReservationID string - JPTokClusterName string - WorkerNodeMaxCount string - WorkerNodeInstanceType string - Solution string - sccEnabled string - sccEventNotificationPlan string - sccLocation string - observabilityMonitoringEnable string - observabilityMonitoringOnComputeNodesEnable string -} - -// GetEnvVars retrieves environment variables. -func GetEnvVars() EnvVars { - return EnvVars{ - DefaultExistingResourceGroup: os.Getenv("DEFAULT_EXISTING_RESOURCE_GROUP"), - NonDefaultExistingResourceGroup: os.Getenv("NON_DEFAULT_EXISTING_RESOURCE_GROUP"), - Zone: os.Getenv("ZONE"), - ClusterName: os.Getenv("CLUSTER_NAME"), - ReservationID: os.Getenv("RESERVATION_ID"), - RemoteAllowedIPs: os.Getenv("REMOTE_ALLOWED_IPS"), - SSHKey: os.Getenv("SSH_KEY"), - LoginNodeInstanceType: os.Getenv("LOGIN_NODE_INSTANCE_TYPE"), - LoginNodeImageName: os.Getenv("LOGIN_NODE_IMAGE_NAME"), - ManagementImageName: os.Getenv("MANAGEMENT_IMAGE_NAME"), - ComputeImageName: os.Getenv("COMPUTE_IMAGE_NAME"), - ManagementNodeInstanceType: os.Getenv("MANAGEMENT_NODE_INSTANCE_TYPE"), - ManagementNodeCount: os.Getenv("MANAGEMENT_NODE_COUNT"), - KeyManagement: os.Getenv("KEY_MANAGEMENT"), - KMSInstanceName: os.Getenv("KMS_INSTANCE_NAME"), - KMSKeyName: os.Getenv("KMS_KEY_NAME"), - HyperthreadingEnabled: os.Getenv("HYPERTHREADING_ENABLED"), - DnsDomainName: os.Getenv("DNS_DOMAIN_NAME"), - EnableAppCenter: os.Getenv("ENABLE_APP_CENTER"), - AppCenterGuiPassword: os.Getenv("APP_CENTER_GUI_PASSWORD"), - EnableLdap: os.Getenv("ENABLE_LDAP"), - LdapBaseDns: os.Getenv("LDAP_BASEDNS"), - LdapServer: os.Getenv("LDAP_SERVER"), - LdapAdminPassword: os.Getenv("LDAP_ADMIN_PASSWORD"), - LdapUserName: os.Getenv("LDAP_USER_NAME"), - LdapUserPassword: os.Getenv("LDAP_USER_PASSWORD"), - USEastZone: os.Getenv("US_EAST_ZONE"), - USEastReservationID: os.Getenv("US_EAST_RESERVATION_ID"), - USEastClusterName: os.Getenv("US_EAST_CLUSTER_NAME"), - EUDEZone: os.Getenv("EU_DE_ZONE"), - EUDEReservationID: os.Getenv("EU_DE_RESERVATION_ID"), - EUDEClusterName: os.Getenv("EU_DE_CLUSTER_NAME"), - USSouthZone: os.Getenv("US_SOUTH_ZONE"), - USSouthReservationID: os.Getenv("US_SOUTH_RESERVATION_ID"), - USSouthClusterName: os.Getenv("US_SOUTH_CLUSTER_NAME"), - JPTokZone: os.Getenv("JP_TOK_ZONE"), - JPTokReservationID: os.Getenv("JP_TOK_RESERVATION_ID"), - JPTokClusterName: os.Getenv("JP_TOK_CLUSTER_NAME"), - SSHFilePath: os.Getenv("SSH_FILE_PATH"), - WorkerNodeMaxCount: os.Getenv("WORKER_NODE_MAX_COUNT"), //LSF specific parameter - WorkerNodeInstanceType: os.Getenv("WORKER_NODE_INSTANCE_TYPE"), //LSF specific 
parameter - Solution: os.Getenv("SOLUTION"), - sccEnabled: os.Getenv("SCC_ENABLED"), - sccEventNotificationPlan: os.Getenv("SCC_EVENT_NOTIFICATION_PLAN"), - sccLocation: os.Getenv("SCC_LOCATION"), - observabilityMonitoringEnable: os.Getenv("OBSERVABILITY_MONITORING_ENABLE"), - observabilityMonitoringOnComputeNodesEnable: os.Getenv("OBSERVABILITY_MONITORING_ON_COMPUTE_NODES_ENABLE"), - } -} - -var ( - // testLogger stores the logger instance for logging test messages. - testLogger *utils.AggregatedLogger - // loggerErr stores the error occurred during logger initialization. - loggerErr error - // testSuiteInitialized indicates whether the test suite has been initialized. - testSuiteInitialized bool -) - -// setupTestSuite initializes the test suite. -func setupTestSuite(t *testing.T) { - if !testSuiteInitialized { - fmt.Println("Started executing the test suite...") - timestamp := time.Now().Format("2006-01-02_15-04-05") - - var logFileName string - if validationLogFilePrefix, ok := os.LookupEnv("LOG_FILE_NAME"); ok { - fileName := strings.Split(validationLogFilePrefix, ".json")[0] - logFileName = fmt.Sprintf("%s.log", fileName) - } else { - logFileName = fmt.Sprintf("%s.log", timestamp) - } - - // Ensure log file name is available in environment variables - _ = os.Setenv("LOG_FILE_NAME", fmt.Sprintf("%s.json", strings.Split(logFileName, ".")[0])) - - testLogger, loggerErr = utils.NewAggregatedLogger(logFileName) - if loggerErr != nil { - t.Fatalf("Error initializing logger: %v", loggerErr) - } - testLogger.Info(t, "Logger initialized successfully") - testSuiteInitialized = true - } - -} - -var upgradeOnce sync.Once // Ensures upgrade is performed only once - -func UpgradeTerraformOnce(t *testing.T, terraformOptions *terraform.Options) { - upgradeOnce.Do(func() { - testLogger.Info(t, "Running Terraform upgrade with `-upgrade=true`...") - - // Run terraform upgrade command - output, err := terraform.RunTerraformCommandE(t, terraformOptions, "init", "-upgrade=true") - require.NoError(t, err, "Terraform upgrade failed") - - // Log the Terraform upgrade output in case of any failures - testLogger.FAIL(t, fmt.Sprintf("Terraform upgrade output:\n%s", output)) - }) -} - -// validateEnvVars validates required environment variables based on the solution type. 
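UpgradeTerraformOnce, removed just above, wraps the `terraform init -upgrade=true` call in sync.Once so that the upgrade runs a single time even though many parallel tests invoke the helper. The same guard pattern in isolation, with the Terraform call replaced by a placeholder function:

package main

import (
	"fmt"
	"sync"
)

var upgradeOnce sync.Once

// upgradeModules stands in for the real `terraform init -upgrade=true` call.
func upgradeModules() {
	fmt.Println("upgrading provider and module versions (runs exactly once)")
}

func ensureUpgraded() {
	upgradeOnce.Do(upgradeModules)
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			ensureUpgraded() // safe to call from every test; the body executes once
		}()
	}
	wg.Wait()
}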
-func validateEnvVars(solution string, envVars EnvVars) error { - var requiredVars []string - - // Determine required variables based on the solution type - if strings.Contains(solution, "hpc") { - requiredVars = []string{"SSHKey", "ClusterName", "Zone", "ReservationID"} - } else if strings.Contains(solution, "lsf") { - requiredVars = []string{"SSHKey", "ClusterName", "Zone"} - } else { - return fmt.Errorf("invalid solution type: %s", solution) - } - - // Validate if the required variables are set - for _, fieldName := range requiredVars { - if fieldValue := reflect.ValueOf(envVars).FieldByName(fieldName).String(); fieldValue == "" { - return fmt.Errorf("missing required environment variable: %s", fieldName) - } - } - return nil -} - -// setupOptionsVpc creates a test options object with the given parameters to creating brand new vpc -func setupOptionsVpc(t *testing.T, hpcClusterPrefix, terraformDir, existingResourceGroup string) (*testhelper.TestOptions, error) { - - // Check if TF_VAR_ibmcloud_api_key is set - if os.Getenv("TF_VAR_ibmcloud_api_key") == "" { - return nil, fmt.Errorf("TF_VAR_ibmcloud_api_key is not set") - } - - // Retrieve environment variables - envVars := GetEnvVars() - - // Validate required environment variables - requiredVars := []string{"SSHKey", "Zone"} - for _, fieldName := range requiredVars { - // Check if the field value is empty - if fieldValue := reflect.ValueOf(envVars).FieldByName(fieldName).String(); fieldValue == "" { - return nil, fmt.Errorf("missing required environment variable: %s", fieldName) - } - } - - // Generate timestamped cluster prefix - prefix := utils.GenerateTimestampedClusterPrefix(hpcClusterPrefix) - - // Create test options - options := &testhelper.TestOptions{ - Testing: t, - TerraformDir: terraformDir, - IgnoreDestroys: testhelper.Exemptions{List: ignoreDestroys}, - TerraformVars: map[string]interface{}{ - "cluster_prefix": prefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "existing_resource_group": existingResourceGroup, - }, - } - return options, nil -} - -// setupOptions creates a test options object with the given parameters. 
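The validateEnvVars helper removed above checks required fields by name through the reflect package instead of listing them twice in code. A minimal standalone version of that reflection check, with the struct and field set trimmed down for illustration:

package main

import (
	"fmt"
	"reflect"
)

type envVars struct {
	SSHKey      string
	ClusterName string
	Zone        string
}

// requireFields returns an error for the first named string field that is empty.
func requireFields(v interface{}, names ...string) error {
	rv := reflect.ValueOf(v)
	for _, name := range names {
		f := rv.FieldByName(name)
		if !f.IsValid() {
			return fmt.Errorf("unknown field: %s", name)
		}
		if f.String() == "" {
			return fmt.Errorf("missing required environment variable: %s", name)
		}
	}
	return nil
}

func main() {
	vars := envVars{SSHKey: "my-key", Zone: "us-east-1"}
	if err := requireFields(vars, "SSHKey", "ClusterName", "Zone"); err != nil {
		fmt.Println(err) // missing required environment variable: ClusterName
	}
}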
-func setupOptions(t *testing.T, hpcClusterPrefix, terraformDir, existingResourceGroup string, ignoreDestroys []string, ignoreUpdates []string) (*testhelper.TestOptions, error) { - - // Check if TF_VAR_ibmcloud_api_key is set - if os.Getenv("TF_VAR_ibmcloud_api_key") == "" { - return nil, fmt.Errorf("TF_VAR_ibmcloud_api_key is not set") - } - - // Lookup environment variable for solution type - solution, ok := os.LookupEnv("SOLUTION") - if !ok || solution == "" { - return nil, fmt.Errorf("SOLUTION environment variable not set") - } - - // Convert solution to lowercase for consistency - solution = strings.ToLower(solution) - - // Retrieve environment variables - envVars := GetEnvVars() - - // Validate environment variables based on solution type - if err := validateEnvVars(solution, envVars); err != nil { - return nil, err - } - - // Generate timestamped cluster prefix - prefix := utils.GenerateTimestampedClusterPrefix(hpcClusterPrefix) - - // Create test options - options := &testhelper.TestOptions{ - Testing: t, - TerraformDir: terraformDir, - IgnoreDestroys: testhelper.Exemptions{List: ignoreDestroys}, - IgnoreUpdates: testhelper.Exemptions{List: ignoreUpdates}, - TerraformVars: map[string]interface{}{ - "cluster_prefix": prefix, - "bastion_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "compute_ssh_keys": utils.SplitAndTrim(envVars.SSHKey, ","), - "zones": utils.SplitAndTrim(envVars.Zone, ","), - "remote_allowed_ips": utils.SplitAndTrim(envVars.RemoteAllowedIPs, ","), - "cluster_name": envVars.ClusterName, - "reservation_id": envVars.ReservationID, - "existing_resource_group": existingResourceGroup, - "login_node_instance_type": envVars.LoginNodeInstanceType, - "login_image_name": envVars.LoginNodeImageName, - "management_image_name": envVars.ManagementImageName, - "management_node_instance_type": envVars.ManagementNodeInstanceType, - "management_node_count": envVars.ManagementNodeCount, - "compute_image_name": envVars.ComputeImageName, - "key_management": envVars.KeyManagement, - "hyperthreading_enabled": strings.ToLower(envVars.HyperthreadingEnabled), - "app_center_high_availability": false, - "observability_atracker_enable": false, - "dns_domain_name": map[string]string{"compute": envVars.DnsDomainName}, - "worker_node_max_count": envVars.WorkerNodeMaxCount, //LSF specific parameter - "worker_node_instance_type": envVars.WorkerNodeInstanceType, //LSF specific parameter - "solution": envVars.Solution, - "scc_enable": false, - }, - } - - // Remove optional parameters based on solution type - if solution == "hpc" { - delete(options.TerraformVars, "worker_node_max_count") - delete(options.TerraformVars, "worker_node_instance_type") - - } +func TestRunDefault(t *testing.T) { + t.Parallel() - // Remove any variables with empty values - for key, value := range options.TerraformVars { - if value == "" { - delete(options.TerraformVars, key) - } - } + require.NoError(t, os.Setenv("ZONES", "us-east-3"), "Failed to set ZONES env variable") + require.NoError(t, os.Setenv("DEFAULT_EXISTING_RESOURCE_GROUP", "Default"), "Failed to set DEFAULT_EXISTING_RESOURCE_GROUP") - return options, nil + t.Log("Running default LSF cluster test for region us-east-3") + lsf_tests.DefaultTest(t) } +// TestMain is the entry point for all tests func TestMain(m *testing.M) { - var solution string - - // Lookup environment variable - if envSolution, ok := os.LookupEnv("SOLUTION"); ok { - solution = envSolution - } else { - // Set default value if SOLUTION is not set - solution = "lsf" - _ = os.Setenv("SOLUTION", 
solution) - log.Printf("SOLUTION environment variable is not set. Setting default value to: LSF") - } - - // Convert the product name to lowercase and determine the config file - solution = strings.ToLower(solution) - var productFileName string - switch solution { - case "hpc": - productFileName = "hpc_config.yml" - case "lsf": - productFileName = "lsf_config.yml" - default: - log.Fatalf("Invalid solution specified: %s", solution) + // Load LSF version configuration + productFileName, err := lsf_tests.GetLSFVersionConfig() + if err != nil { + log.Fatalf("❌ Failed to get LSF version config: %v", err) } - // Get the absolute path of the configuration file - absPath, err := filepath.Abs(productFileName) - if err != nil || absPath == "" { - log.Fatalf("error getting absolute path for file %s: %v", productFileName, err) + // Load and validate configuration + configFilePath, err := filepath.Abs("data/" + productFileName) + if err != nil { + log.Fatalf("❌ Failed to resolve config path: %v", err) } - // Check if the configuration file exists - if _, err := os.Stat(absPath); os.IsNotExist(err) { - log.Fatalf("Configuration file not found: %s", absPath) + if _, err := os.Stat(configFilePath); err != nil { + log.Fatalf("❌ Config file not accessible: %v", err) } - // Load configuration from the YAML file - config, err := utils.GetConfigFromYAML(absPath) - if err != nil { - log.Fatalf("Error reading configuration from YAML: %v", err) + if _, err := deploy.GetConfigFromYAML(configFilePath); err != nil { + log.Fatalf("❌ Config load failed: %v", err) } - log.Printf("Successfully loaded configuration: %+v", config) + log.Printf("✅ Configuration loaded successfully from %s", filepath.Base(configFilePath)) // Execute tests exitCode := m.Run() - // Generate report if the JSON log file is set + // Generate HTML report if JSON log exists if jsonFileName, ok := os.LookupEnv("LOG_FILE_NAME"); ok { if _, err := os.Stat(jsonFileName); err == nil { results, err := utils.ParseJSONFile(jsonFileName) - if err == nil { - // Call the GenerateHTMLReport function and handle its return value - err := utils.GenerateHTMLReport(results) - if err != nil { - // Log the error and take appropriate action - log.Printf("Error generating HTML report: %v", err) - } - - } else { - log.Printf("Error generating HTML report: %v", err) + if err != nil { + log.Printf("Failed to parse JSON results: %v", err) + } else if err := utils.GenerateHTMLReport(results); err != nil { + log.Printf("Failed to generate HTML report: %v", err) } - } else { - log.Printf("JSON log file not found: %s", jsonFileName) } } - // Exit with the test result code os.Exit(exitCode) } - -// TestRunDefault creates a basic HPC cluster and verifies its setup. 
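One detail of the rewritten TestMain above: os.Exit does not unwind deferred calls, which is why the HTML report generation runs inline before os.Exit rather than in a defer. A compact illustration of that behavior, independent of the repo's helpers:

package main

import (
	"fmt"
	"os"
)

func run() int {
	defer fmt.Println("deferred cleanup inside run() still executes") // runs on return
	fmt.Println("running tests...")
	return 0
}

func main() {
	defer fmt.Println("this deferred call is skipped") // os.Exit bypasses defers in main
	code := run()
	fmt.Println("writing report before exiting") // post-processing must happen here
	os.Exit(code)
}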
-func TestRunDefault(t *testing.T) { - // Run tests in parallel - t.Parallel() - - // Initialize test suite - setupTestSuite(t) - - // Log initiation of cluster creation - testLogger.Info(t, "Initiating cluster creation for "+t.Name()) - - // Generate a unique prefix for the HPC cluster - hpcClusterPrefix := utils.GenerateRandomString() - - // Retrieve environment variables for the test - envVars := GetEnvVars() - - // Prepare test options with necessary parameters - options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultExistingResourceGroup, ignoreDestroys, ignoreUpdates) - if err != nil { - testLogger.FAIL(t, fmt.Sprintf("Failed to set up test options: %v", err)) - require.NoError(t, err, "Failed to set up test options: %v", err) - } - - // Run consistency test and handle potential errors - output, err := options.RunTest() - if err != nil { - testLogger.FAIL(t, fmt.Sprintf("Error running consistency test: %v", err)) - require.NoError(t, err, "Error running consistency test: %v", err) - } - - // Ensure that output is not nil - require.NotNil(t, output, "Expected non-nil output, but got nil") - - // Log success if no errors occurred - testLogger.PASS(t, "Test passed successfully") -} diff --git a/tests/utilities/deployment.go b/tests/utilities/deployment.go deleted file mode 100644 index 940808af..00000000 --- a/tests/utilities/deployment.go +++ /dev/null @@ -1,299 +0,0 @@ -package tests - -import ( - "encoding/json" - "fmt" - "os" - "strings" - - "github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper/common" - "gopkg.in/yaml.v3" -) - -var ( - ip string - reservationIDSouth string - reservationIDEast string -) - -// Define a struct with fields that match the structure of the YAML data -const yamlLocation = "../common-dev-assets/common-go-assets/common-permanent-resources.yaml" - -// WorkerNode represents the structure of each worker node instance type. -type WorkerNode struct { - Count int `yaml:"count"` - InstanceType string `yaml:"instance_type"` -} - -// Config represents the structure of the configuration file. 
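The removed deployment.go decoded files such as lsf_config.yml into the Config struct that follows, including the nested worker_node_instance_type list typed as []WorkerNode above. A small self-contained example of that decoding with gopkg.in/yaml.v3; the field set is reduced for brevity and the values are made up.

package main

import (
	"fmt"
	"log"

	"gopkg.in/yaml.v3"
)

type workerNode struct {
	Count        int    `yaml:"count"`
	InstanceType string `yaml:"instance_type"`
}

type config struct {
	Zone                   string       `yaml:"zone"`
	ClusterName            string       `yaml:"cluster_name"`
	WorkerNodeInstanceType []workerNode `yaml:"worker_node_instance_type"`
}

func main() {
	raw := []byte(`
zone: us-east-3
cluster_name: demo-cluster
worker_node_instance_type:
  - count: 2
    instance_type: bx2-2x8
  - count: 1
    instance_type: cx2-2x4
`)
	var cfg config
	if err := yaml.Unmarshal(raw, &cfg); err != nil {
		log.Fatalf("failed to decode YAML: %v", err)
	}
	fmt.Printf("%s in %s with %d worker pools\n", cfg.ClusterName, cfg.Zone, len(cfg.WorkerNodeInstanceType))
}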
-type Config struct { - DefaultExistingResourceGroup string `yaml:"default_existing_resource_group"` - NonDefaultExistingResourceGroup string `yaml:"non_default_existing_resource_group"` - Zone string `yaml:"zone"` - ClusterName string `yaml:"cluster_name"` - ReservationID string `yaml:"reservation_id"` - RemoteAllowedIPs string `yaml:"remote_allowed_ips"` - SSHKey string `yaml:"ssh_key"` - LoginNodeInstanceType string `yaml:"login_node_instance_type"` - LoginNodeImageName string `yaml:"login_image_name"` - ManagementImageName string `yaml:"management_image_name"` - ComputeImageName string `yaml:"compute_image_name"` - ManagementNodeInstanceType string `yaml:"management_node_instance_type"` - ManagementNodeCount int `yaml:"management_node_count"` - EnableVPCFlowLogs bool `yaml:"enable_vpc_flow_logs"` - KeyManagement string `yaml:"key_management"` - KMSInstanceName string `yaml:"kms_instance_name"` - KMSKeyName string `yaml:"kms_key_name"` - HyperthreadingEnabled bool `yaml:"hyperthreading_enabled"` - DnsDomainName string `yaml:"dns_domain_name"` - EnableAppCenter bool `yaml:"enable_app_center"` - AppCenterGuiPassword string `yaml:"app_center_gui_pwd"` // pragma: allowlist secret - EnableLdap bool `yaml:"enable_ldap"` - LdapBaseDns string `yaml:"ldap_basedns"` - LdapServer string `yaml:"ldap_server"` - LdapAdminPassword string `yaml:"ldap_admin_password"` // pragma: allowlist secret - LdapUserName string `yaml:"ldap_user_name"` - LdapUserPassword string `yaml:"ldap_user_password"` // pragma: allowlist secret - USEastZone string `yaml:"us_east_zone"` - USEastClusterName string `yaml:"us_east_cluster_name"` - USEastReservationID string `yaml:"us_east_reservation_id"` - JPTokZone string `yaml:"jp_tok_zone"` - JPTokClusterName string `yaml:"jp_tok_cluster_name"` - JPTokReservationID string `yaml:"jp_tok_reservation_id"` - EUDEZone string `yaml:"eu_de_zone"` - EUDEClusterName string `yaml:"eu_de_cluster_name"` - EUDEReservationID string `yaml:"eu_de_reservation_id"` - USSouthZone string `yaml:"us_south_zone"` - USSouthClusterName string `yaml:"us_south_cluster_name"` - USSouthReservationID string `yaml:"us_south_reservation_id"` - SSHFilePath string `yaml:"ssh_file_path"` - SSHFilePathTwo string `yaml:"ssh_file_path_two"` - Solution string `yaml:"solution"` - WorkerNodeMaxCount int `yaml:"worker_node_max_count"` - WorkerNodeInstanceType []WorkerNode `yaml:"worker_node_instance_type"` - SccEnabled bool `yaml:"scc_enable"` - SccEventNotificationPlan string `yaml:"scc_event_notification_plan"` - SccLocation string `yaml:"scc_location"` - ObservabilityMonitoringEnable bool `yaml:"observability_monitoring_enable"` - ObservabilityMonitoringOnComputeNodesEnable bool `yaml:"observability_monitoring_on_compute_nodes_enable"` -} - -// GetConfigFromYAML reads configuration from a YAML file and sets environment variables based on the configuration. -// It returns a Config struct populated with the configuration values. 
-func GetConfigFromYAML(filePath string) (*Config, error) { - var config Config - - // Open the YAML file - file, err := os.Open(filePath) - if err != nil { - return nil, fmt.Errorf("failed to open YAML file %s: %v", filePath, err) - } - - defer func() { - if err := file.Close(); err != nil { - fmt.Printf("warning: failed to close file: %v\n", err) - } - }() - - // Decode the YAML file into the config struct - if err := yaml.NewDecoder(file).Decode(&config); err != nil { - return nil, fmt.Errorf("failed to decode YAML: %v", err) - } - - // Get the public IP - ip, err = GetPublicIP() - if err != nil { - return nil, fmt.Errorf("failed to get public IP: %v", err) - } - - // Load permanent resources from YAML - permanentResources, err := common.LoadMapFromYaml(yamlLocation) - if err != nil { - return nil, fmt.Errorf("failed to load permanent resources from YAML: %v", err) - } - - // Retrieve reservation ID from Secret Manager // pragma: allowlist secret - reservationIDVal, ok := permanentResources["reservation_id_secret_id"].(string) - if !ok { - fmt.Println("Invalid type or nil value for reservation_id_secret_id") - } - - reservationIDEastPtr, err := GetSecretsManagerKey( - permanentResources["secretsManagerGuid"].(string), - permanentResources["secretsManagerRegion"].(string), - reservationIDVal, // Pass safely extracted value - ) - - if err != nil { - fmt.Printf("Error retrieving reservation ID from secrets: %v\n", err) // pragma: allowlist secret - } else if reservationIDEastPtr != nil { - reservationIDEast = *reservationIDEastPtr - } - - // Set environment variables from config - if err := setEnvFromConfig(&config); err != nil { - return nil, fmt.Errorf("failed to set environment variables: %v", err) - } - return &config, nil -} - -// setEnvFromConfig sets environment variables based on the provided configuration. 
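setEnvFromConfig, shown next, applies an "explicitly exported environment wins, config fills the gaps" rule before the tests read their inputs. A stripped-down, standalone sketch of just that rule, using a hypothetical helper name:

package main

import (
	"fmt"
	"os"
)

// setIfUnset applies a config-supplied value only when the variable is not
// already exported (hypothetical helper; the real code also handles bools,
// ints, and slices).
func setIfUnset(key, value string) error {
	if _, ok := os.LookupEnv(key); ok {
		return nil // an explicitly exported value always wins
	}
	if value == "" {
		return nil // nothing to apply
	}
	return os.Setenv(key, value)
}

func main() {
	_ = os.Setenv("ZONE", "us-east-1") // pretend the caller exported ZONE
	_ = setIfUnset("ZONE", "eu-de-2")  // ignored: ZONE is already set
	_ = setIfUnset("SOLUTION", "lsf")  // applied, assuming SOLUTION was unset
	fmt.Println(os.Getenv("ZONE"), os.Getenv("SOLUTION"))
}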
-func setEnvFromConfig(config *Config) error { - envVars := map[string]interface{}{ - "DEFAULT_EXISTING_RESOURCE_GROUP": config.DefaultExistingResourceGroup, - "NON_DEFAULT_EXISTING_RESOURCE_GROUP": config.NonDefaultExistingResourceGroup, - "ZONE": config.Zone, - "CLUSTER_NAME": config.ClusterName, - "RESERVATION_ID": config.ReservationID, - "REMOTE_ALLOWED_IPS": config.RemoteAllowedIPs, - "SSH_KEY": config.SSHKey, - "LOGIN_NODE_INSTANCE_TYPE": config.LoginNodeInstanceType, - "LOGIN_NODE_IMAGE_NAME": config.LoginNodeImageName, - "MANAGEMENT_IMAGE_NAME": config.ManagementImageName, - "COMPUTE_IMAGE_NAME": config.ComputeImageName, - "MANAGEMENT_NODE_INSTANCE_TYPE": config.ManagementNodeInstanceType, - "MANAGEMENT_NODE_COUNT": config.ManagementNodeCount, - "ENABLE_VPC_FLOW_LOGS": config.EnableVPCFlowLogs, - "KEY_MANAGEMENT": config.KeyManagement, - "KMS_INSTANCE_NAME": config.KMSInstanceName, - "KMS_KEY_NAME": config.KMSKeyName, - "HYPERTHREADING_ENABLED": config.HyperthreadingEnabled, - "DNS_DOMAIN_NAME": config.DnsDomainName, - "ENABLE_APP_CENTER": config.EnableAppCenter, - "APP_CENTER_GUI_PASSWORD": config.AppCenterGuiPassword, //pragma: allowlist secret - "ENABLE_LDAP": config.EnableLdap, - "LDAP_BASEDNS": config.LdapBaseDns, - "LDAP_SERVER": config.LdapServer, - "LDAP_ADMIN_PASSWORD": config.LdapAdminPassword, //pragma: allowlist secret - "LDAP_USER_NAME": config.LdapUserName, - "LDAP_USER_PASSWORD": config.LdapUserPassword, //pragma: allowlist secret - "US_EAST_ZONE": config.USEastZone, - "US_EAST_RESERVATION_ID": config.USEastReservationID, - "US_EAST_CLUSTER_NAME": config.USEastClusterName, - "EU_DE_ZONE": config.EUDEZone, - "EU_DE_RESERVATION_ID": config.EUDEReservationID, - "EU_DE_CLUSTER_NAME": config.EUDEClusterName, - "US_SOUTH_ZONE": config.USSouthZone, - "US_SOUTH_RESERVATION_ID": config.USSouthReservationID, - "US_SOUTH_CLUSTER_NAME": config.USSouthClusterName, - "JP_TOK_ZONE": config.JPTokZone, - "JP_TOK_RESERVATION_ID": config.JPTokReservationID, - "JP_TOK_CLUSTER_NAME": config.JPTokClusterName, - "SSH_FILE_PATH": config.SSHFilePath, - "SSH_FILE_PATH_TWO": config.SSHFilePathTwo, - "SOLUTION": config.Solution, - "WORKER_NODE_MAX_COUNT": config.WorkerNodeMaxCount, //LSF specific parameter - "SCC_ENABLED": config.SccEnabled, - "SCC_EVENT_NOTIFICATION_PLAN": config.SccEventNotificationPlan, - "SCC_LOCATION": config.SccLocation, - "OBSERVABILITY_MONITORING_ENABLE": config.ObservabilityMonitoringEnable, - "OBSERVABILITY_MONITORING_ON_COMPUTE_NODES_ENABLE": config.ObservabilityMonitoringOnComputeNodesEnable, - } - - // Format WorkerNodeInstanceType into JSON string - if len(config.WorkerNodeInstanceType) > 0 { - var formattedWorkerNodeInstanceType []map[string]interface{} - for _, workerNode := range config.WorkerNodeInstanceType { - // If instance_type is empty, provide a default - if workerNode.InstanceType == "" { - fmt.Printf("Warning: WorkerNode InstanceType is empty, setting to default\n") - } - - node := map[string]interface{}{ - "count": workerNode.Count, - "instance_type": workerNode.InstanceType, - } - formattedWorkerNodeInstanceType = append(formattedWorkerNodeInstanceType, node) - } - - // Marshal to JSON string - workerNodeInstanceTypeJSON, err := json.Marshal(formattedWorkerNodeInstanceType) - if err != nil { - return fmt.Errorf("failed to marshal WORKER_NODE_INSTANCE_TYPE: %v", err) - } - - envVars["WORKER_NODE_INSTANCE_TYPE"] = string(workerNodeInstanceTypeJSON) - } else { - envVars["WORKER_NODE_INSTANCE_TYPE"] = "[]" // Empty array if not set - } - - // Set 
environment variables - for key, value := range envVars { - val, ok := os.LookupEnv(key) - switch { - case strings.Contains(key, "KEY_MANAGEMENT") && val == "null" && ok: - if err := os.Setenv(key, "null"); err != nil { - return fmt.Errorf("failed to set %s to 'null': %v", key, err) - } - case strings.Contains(key, "REMOTE_ALLOWED_IPS") && !ok && value == "": - if err := os.Setenv(key, ip); err != nil { - return fmt.Errorf("failed to set %s to %s: %v", key, ip, err) - } - case value != "" && !ok: - switch v := value.(type) { - case string: - if err := os.Setenv(key, v); err != nil { - return fmt.Errorf("failed to set %s to %s: %v", key, v, err) - } - case bool: - if err := os.Setenv(key, fmt.Sprintf("%t", v)); err != nil { - return fmt.Errorf("failed to set %s to %t: %v", key, v, err) - } - case int: - if err := os.Setenv(key, fmt.Sprintf("%d", v)); err != nil { - return fmt.Errorf("failed to set %s to %d: %v", key, v, err) - } - case float64: - if err := os.Setenv(key, fmt.Sprintf("%f", v)); err != nil { - return fmt.Errorf("failed to set %s to %f: %v", key, v, err) - } - case []string: - if err := os.Setenv(key, strings.Join(v, ",")); err != nil { - return fmt.Errorf("failed to set %s to joined string: %v", key, err) - } - case []WorkerNode: - workerNodeInstanceTypeJSON, err := json.Marshal(v) - if err != nil { - return fmt.Errorf("failed to marshal %s: %v", key, err) - } - if err := os.Setenv(key, string(workerNodeInstanceTypeJSON)); err != nil { - return fmt.Errorf("failed to set %s to JSON: %v", key, err) - } - default: - return fmt.Errorf("unsupported type for key %s", key) - } - } - } - - // Handle missing reservation IDs if necessary - for key, value := range envVars { - _, ok := os.LookupEnv(key) - switch { - case key == "RESERVATION_ID" && !ok && value == "": - val := GetValueForKey( - map[string]string{ - "us-south": reservationIDSouth, - "us-east": reservationIDEast, - }, - strings.ToLower(GetRegion(os.Getenv("ZONE"))), - ) - if err := os.Setenv("RESERVATION_ID", val); err != nil { - return fmt.Errorf("failed to set RESERVATION_ID: %v", err) - } - case key == "US_EAST_RESERVATION_ID" && !ok && value == "": - if err := os.Setenv("US_EAST_RESERVATION_ID", reservationIDEast); err != nil { - return fmt.Errorf("failed to set US_EAST_RESERVATION_ID: %v", err) - } - case key == "EU_DE_RESERVATION_ID" && !ok && value == "": - if err := os.Setenv("EU_DE_RESERVATION_ID", reservationIDEast); err != nil { - return fmt.Errorf("failed to set EU_DE_RESERVATION_ID: %v", err) - } - case key == "US_SOUTH_RESERVATION_ID" && !ok && value == "": - if err := os.Setenv("US_SOUTH_RESERVATION_ID", reservationIDSouth); err != nil { - return fmt.Errorf("failed to set US_SOUTH_RESERVATION_ID: %v", err) - } - } - } - - return nil -} diff --git a/tests/utilities/fileops.go b/tests/utilities/fileops.go index b1a535b9..f5359600 100644 --- a/tests/utilities/fileops.go +++ b/tests/utilities/fileops.go @@ -157,7 +157,8 @@ func ToCreateFileWithContent(t *testing.T, sClient *ssh.Client, filePath, fileNa if isPathExist { // Path exists, create the file using SSH command - command := "cd " + filePath + " && echo '" + content + "' > " + fileName + //command := "cd " + filePath + " && echo '" + content + "' > " + fileName + command := fmt.Sprintf("cd %s && echo %q > %s", filePath, content, fileName) _, createFileErr := RunCommandInSSHSession(sClient, command) if createFileErr == nil { diff --git a/tests/utilities/helpers.go b/tests/utilities/helpers.go index 3b5944d0..e0572011 100644 --- a/tests/utilities/helpers.go 
+++ b/tests/utilities/helpers.go @@ -132,6 +132,15 @@ func LogVerificationResult(t *testing.T, err error, checkName string, logger *Ag } } +// Add this to your logger package or test utilities +func LogValidationResult(t *testing.T, success bool, message string, l *AggregatedLogger) { + if success { + l.PASS(t, fmt.Sprintf("Validation succeeded: %s", message)) + } else { + l.FAIL(t, fmt.Sprintf("Validation failed: %s", message)) + } +} + // ParsePropertyValue parses the content of a string, searching for a property with the specified key. // It returns the value of the property if found, or an empty string and an error if the property is not found. func ParsePropertyValue(content, propertyKey string) (string, error) { @@ -374,44 +383,82 @@ func GetBastionIP(t *testing.T, options *testhelper.TestOptions, logger *Aggrega return bastionIP, nil } -// GetValueFromIniFile retrieves a value from an INI file based on the provided section and key. -// It reads the specified INI file, extracts the specified section, and returns the value associated with the key. -func GetValueFromIniFile(filePath, sectionName string) ([]string, error) { - // Read the content of the file +// // GetValueFromIniFile retrieves a value from an INI file based on the provided section and key. +// // It reads the specified INI file, extracts the specified section, and returns the value associated with the key. +// func GetValueFromIniFile(filePath, sectionName string) ([]string, error) { +// // Read the content of the file +// absolutePath, err := filepath.Abs(filePath) +// if err != nil { +// return nil, err +// } +// data, err := os.ReadFile(absolutePath) +// if err != nil { +// return nil, err +// } + +// // Convert the byte slice to a string +// content := string(data) + +// // Split the input into sections based on empty lines +// sections := strings.Split(content, "\n\n") + +// // Loop through sections and find the one with the specified sectionName +// for _, section := range sections { + +// if strings.Contains(section, "["+sectionName+"]") { +// // Split the section into lines +// lines := strings.Split(section, "\n") + +// // Extract values +// var sectionValues []string +// for i := 1; i < len(lines); i++ { +// // Skip the first line, as it contains the section name +// sectionValues = append(sectionValues, strings.TrimSpace(lines[i])) +// } + +// return sectionValues, nil +// } +// } + +// return nil, fmt.Errorf("section [%s] not found in file %s", sectionName, filePath) +// } + +// GetValueFromIniFile reads a file containing IP addresses, one per line (with optional trailing '%'). +// It returns a slice of clean IP addresses and any error encountered. 
+func GetValueFromIniFile(filePath string) ([]string, error) { absolutePath, err := filepath.Abs(filePath) if err != nil { return nil, err } - data, err := os.ReadFile(absolutePath) + + file, err := os.Open(absolutePath) if err != nil { return nil, err } - // Convert the byte slice to a string - content := string(data) - - // Split the input into sections based on empty lines - sections := strings.Split(content, "\n\n") - - // Loop through sections and find the one with the specified sectionName - for _, section := range sections { - - if strings.Contains(section, "["+sectionName+"]") { - // Split the section into lines - lines := strings.Split(section, "\n") + defer func() { + if cerr := file.Close(); cerr != nil { + // handle error, log or override return err + fmt.Printf("Error closing file: %v\n", cerr) + } + }() - // Extract values - var sectionValues []string - for i := 1; i < len(lines); i++ { - // Skip the first line, as it contains the section name - sectionValues = append(sectionValues, strings.TrimSpace(lines[i])) - } + var ipAddresses []string + scanner := bufio.NewScanner(file) - return sectionValues, nil + for scanner.Scan() { + ip := strings.TrimSpace(scanner.Text()) + ip = strings.TrimSuffix(ip, "%") // Remove trailing % if present + if ip != "" { + ipAddresses = append(ipAddresses, ip) } } - return nil, fmt.Errorf("section [%s] not found in file %s", sectionName, filePath) + if err := scanner.Err(); err != nil { + return nil, err + } + + return ipAddresses, nil } // GetRegion returns the region from a given zone. @@ -509,66 +556,93 @@ func ConvertToInt(value interface{}) (int, error) { } } -// GetTotalWorkerNodeCount extracts and sums up all "count" values. -func GetTotalWorkerNodeCount(t *testing.T, terraformVars map[string]interface{}, logger *AggregatedLogger) (int, error) { - rawVal, exists := terraformVars["worker_node_instance_type"] +// GetTotalStaticComputeCount extracts and sums all "count" values from the "static_compute_instances" variable. +// It logs progress and errors using the provided logger and test context. 
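A hedged usage sketch of the rewritten GetValueFromIniFile above, assuming it is called from the same tests package (fmt already imported); the path and file contents are hypothetical, one address per line with an optional trailing '%':

// contents of /tmp/mgmt_hosts.ini (hypothetical):
//   10.241.0.10%
//   10.241.0.11
//
//   10.241.0.12%
func printHostIPs() error {
	ips, err := GetValueFromIniFile("/tmp/mgmt_hosts.ini")
	if err != nil {
		return err
	}
	for _, ip := range ips {
		fmt.Println(ip) // 10.241.0.10, 10.241.0.11, 10.241.0.12 — '%' suffixes and blank lines removed
	}
	return nil
}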
+func GetTotalStaticComputeCount(t *testing.T, terraformVars map[string]interface{}, logger *AggregatedLogger) (int, error) { + if logger == nil { + return 0, fmt.Errorf("logger cannot be nil") + } + + rawVal, exists := terraformVars["static_compute_instances"] if !exists { - return 0, errors.New("worker_node_instance_type key does not exist") + err := fmt.Errorf("static_compute_instances key does not exist") + logger.Error(t, err.Error()) + return 0, err } - // Ensure rawVal is of type []map[string]interface{} - workers, ok := rawVal.([]map[string]interface{}) + instances, ok := rawVal.([]map[string]interface{}) if !ok { - return 0, fmt.Errorf("worker_node_instance_type is not a slice, but %T", rawVal) + err := fmt.Errorf("static_compute_instances is not a slice of maps (got %T)", rawVal) + logger.Error(t, err.Error()) + return 0, err } - var totalCount int - for i, worker := range workers { - countVal, exists := worker["count"] + if len(instances) == 0 { + logger.Warn(t, "static_compute_instances is empty (count = 0)") + return 0, nil + } + + var total int + for i, inst := range instances { + countVal, exists := inst["count"] if !exists { - return 0, fmt.Errorf("worker at index %d is missing 'count' key", i) + err := fmt.Errorf("instance at index %d is missing 'count' key", i) + logger.Error(t, err.Error()) + return 0, err } - count, err := ConvertToInt(countVal) - if err != nil { - return 0, fmt.Errorf("worker at index %d has invalid 'count' value: %v", i, err) + switch v := countVal.(type) { + case int: + total += v + case float64: + total += int(v) + case json.Number: + n, err := v.Int64() + if err != nil { + logger.Error(t, fmt.Sprintf("instance %d: count is not an integer (got %v)", i, v)) + return 0, fmt.Errorf("instance %d: invalid count: %v", i, err) + } + total += int(n) + default: + err := fmt.Errorf("instance %d: 'count' must be a number (got %T)", i, countVal) + logger.Error(t, err.Error()) + return 0, err } - totalCount += count } - logger.Info(t, fmt.Sprintf("Total Worker Node Count: %d", totalCount)) - return totalCount, nil + logger.Info(t, fmt.Sprintf("Successfully summed counts: total = %d", total)) + return total, nil } -// GetFirstWorkerNodeInstanceType retrieves the "instance_type" of the first worker node. -func GetFirstWorkerNodeInstanceType(t *testing.T, terraformVars map[string]interface{}, logger *AggregatedLogger) (string, error) { - rawVal, exists := terraformVars["worker_node_instance_type"] +// GetFirstDynamicComputeProfile retrieves the "profile" of the first dynamic compute instance. 
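A sketch of the input shape GetTotalStaticComputeCount above expects, written as a test in the same package; the profile names and counts are illustrative:

func TestStaticComputeCountSketch(t *testing.T) {
	logger, err := NewAggregatedLogger("static_compute_sketch.log")
	if err != nil {
		t.Fatalf("failed to create logger: %v", err)
	}
	defer func() { _ = logger.Close() }()

	terraformVars := map[string]interface{}{
		"static_compute_instances": []map[string]interface{}{
			{"profile": "bx2-16x64", "count": 2},
			{"profile": "cx2-8x16", "count": float64(3)}, // float64 and json.Number are also accepted
		},
	}

	total, err := GetTotalStaticComputeCount(t, terraformVars, logger)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if total != 5 {
		t.Errorf("expected total of 5, got %d", total)
	}
}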
+func GetFirstDynamicComputeProfile(t *testing.T, terraformVars map[string]interface{}, logger *AggregatedLogger) (string, error) { + rawVal, exists := terraformVars["dynamic_compute_instances"] if !exists { - return "", errors.New("worker_node_instance_type key does not exist") + return "", errors.New("dynamic_compute_instances key does not exist") } // Ensure rawVal is of type []map[string]interface{} - workers, ok := rawVal.([]map[string]interface{}) + instances, ok := rawVal.([]map[string]interface{}) if !ok { - return "", fmt.Errorf("worker_node_instance_type is not a slice, but %T", rawVal) + return "", fmt.Errorf("dynamic_compute_instances is not a slice, but %T", rawVal) } - if len(workers) == 0 { - return "", errors.New("worker_node_instance_type is empty") + if len(instances) == 0 { + return "", errors.New("dynamic_compute_instances is empty") } - instanceType, exists := workers[0]["instance_type"] + profile, exists := instances[0]["profile"] if !exists { - return "", errors.New("first worker node is missing 'instance_type' key") + return "", errors.New("first dynamic compute instance is missing 'profile' key") } - instanceTypeStr, ok := instanceType.(string) + profileStr, ok := profile.(string) if !ok { - return "", errors.New("instance_type is not a string") + return "", errors.New("'profile' is not a string") } - logger.Info(t, fmt.Sprintf("First Worker Node Instance Type: %s", instanceTypeStr)) - return instanceTypeStr, nil + logger.Info(t, fmt.Sprintf("First Dynamic Compute Profile: %s", profileStr)) + return profileStr, nil } // RunCommandWithRetry executes a shell command with retries @@ -588,3 +662,110 @@ func RunCommandWithRetry(cmd string, retries int, delay time.Duration) ([]byte, return output, err } + +// TrimTrailingWhitespace removes any trailing whitespace characters (spaces, tabs, carriage returns, and newlines) +// from the end of the provided string. It returns the trimmed string. +func TrimTrailingWhitespace(content string) string { + return strings.TrimRight(content, " \t\r\n") +} + +// RemoveDuplicateIPs filters out duplicate IPs from the input slice. +func RemoveDuplicateIPs(ips []string) []string { + seen := make(map[string]struct{}, len(ips)) + unique := make([]string, 0, len(ips)) + + for _, ip := range ips { + if _, exists := seen[ip]; !exists { + seen[ip] = struct{}{} + unique = append(unique, ip) + } + } + return unique +} + +// GetVar retrieves any variable value from the map while preserving its original type. +func GetVar(vars map[string]interface{}, key string) interface{} { + if val, ok := vars[key]; ok { + return val + } + return nil +} + +// GetStringVar converts a map value to string, handling nil and "null" (as a string) as empty string. +func GetStringVar(vars map[string]interface{}, key string) string { + val := GetVar(vars, key) + + if val == nil { + return "" + } + + if s, ok := val.(string); ok { + if s == "null" { + return "" + } + return s + } + + // Fallback: convert other types (e.g., bool, int, float) + return fmt.Sprintf("%v", val) +} + +// GetStringVarWithDefault returns a string value or a default if nil, empty, or "null". +func GetStringVarWithDefault(vars map[string]interface{}, key, defaultValue string) string { + val := GetStringVar(vars, key) + if val != "" { + return val + } + return defaultValue +} + +// GenerateLDAPPasswordHash generates an SSHA hashed password using slappasswd on a remote SSH session. 
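The function below builds the command slappasswd -s '<password>' by interpolating the password into single quotes. If a password can itself contain a single quote, the usual shell trick is to close, escape, and reopen the quote; a sketch with a hypothetical helper (strings assumed imported):

// escapeSingleQuotes rewrites ' as '\'' so the value can be embedded safely
// inside a single-quoted shell argument (hypothetical helper, not in the repo).
func escapeSingleQuotes(s string) string {
	return strings.ReplaceAll(s, "'", `'\''`)
}

// Example: cmd := fmt.Sprintf("slappasswd -s '%s'", escapeSingleQuotes(password))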
+func GenerateLDAPPasswordHash(t *testing.T, sClient *ssh.Client, password string, logger *AggregatedLogger) (string, error) { + // Security check - don't allow empty passwords + if password == "" { + return "", fmt.Errorf("password cannot be empty") + } + + // Safely wrap password to prevent shell injection + cmd := fmt.Sprintf("slappasswd -s '%s'", password) + + logger.Info(t, "Generating LDAP password hash via slappasswd") + + output, err := RunCommandInSSHSession(sClient, cmd) + if err != nil { + return "", fmt.Errorf("failed to generate password hash: %w", err) + } + + // Clean and validate output + hashedPassword := strings.TrimSpace(string(output)) + if !strings.HasPrefix(hashedPassword, "{SSHA}") { + return "", fmt.Errorf("invalid hash format generated: %s", hashedPassword) + } + + return hashedPassword, nil +} + +// ExtractTerraformValue splits a terraform output line by "=" and trims the result +func ExtractTerraformValue(line string) string { + parts := strings.SplitN(line, "=", 2) + if len(parts) < 2 { + return "" + } + return strings.Trim(strings.TrimSpace(parts[1]), `"`) +} + +// GetBoolVar fetches a boolean from the map by key and returns an error if missing or invalid. +// Returns the boolean value and error status. +func GetBoolVar(vars map[string]interface{}, key string) (bool, error) { + val, exists := vars[key] + if !exists { + return false, fmt.Errorf("missing bool var: %q", key) + } + + boolVal, ok := val.(bool) + if !ok { + return false, fmt.Errorf("invalid bool var: %q (got type %T, expected bool)", key, val) + } + + return boolVal, nil +} diff --git a/tests/utilities/logging.go b/tests/utilities/logging.go index 13460355..726cea63 100644 --- a/tests/utilities/logging.go +++ b/tests/utilities/logging.go @@ -1,84 +1,118 @@ package tests import ( + "fmt" + "io" "log" "os" "path/filepath" "testing" "time" +) + +// LogLevel represents different logging levels +type LogLevel string - "github.com/gruntwork-io/terratest/modules/logger" +const ( + LevelInfo LogLevel = "INFO" + LevelWarn LogLevel = "WARN" + LevelError LogLevel = "ERROR" + LevelPass LogLevel = "PASS" + LevelFail LogLevel = "FAIL" + LevelDebug LogLevel = "DEBUG" ) -// AggregatedLogger represents an aggregated logger with different log levels. +// AggregatedLogger provides multi-level logging capabilities type AggregatedLogger struct { - infoLogger *log.Logger - warnLogger *log.Logger - errorLogger *log.Logger - passLogger *log.Logger - failLogger *log.Logger + loggers map[LogLevel]*log.Logger + file *os.File } -// NewAggregatedLogger creates a new instance of AggregatedLogger. 
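Stepping back to the helpers.go hunk above, a brief usage sketch of the newly added helpers, assuming the same tests package with fmt imported; all values are illustrative:

func demoHelpers() {
	// ExtractTerraformValue pulls the right-hand side of a terraform output line.
	fmt.Println(ExtractTerraformValue(`cluster_id = "hpc-sketch-01"`)) // hpc-sketch-01

	vars := map[string]interface{}{
		"key_management":                  "null",
		"observability_monitoring_enable": true,
	}
	// The literal string "null" is treated as empty, so the default applies.
	fmt.Println(GetStringVarWithDefault(vars, "key_management", "key_protect")) // key_protect

	if enabled, err := GetBoolVar(vars, "observability_monitoring_enable"); err == nil {
		fmt.Println(enabled) // true
	}

	fmt.Println(RemoveDuplicateIPs([]string{"10.0.0.1", "10.0.0.1", "10.0.0.2"})) // [10.0.0.1 10.0.0.2]
}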
+// NewAggregatedLogger creates a new logger instance with file output func NewAggregatedLogger(logFileName string) (*AggregatedLogger, error) { - - absPath, err := filepath.Abs("logs") - if err != nil { - return nil, err + // Ensure logs directory exists + logsDir := filepath.Join("..", "logs_output") + if err := os.MkdirAll(logsDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create logs directory: %w", err) } - file, err := os.Create(filepath.Join(absPath, logFileName)) + // Create log file + filePath := filepath.Join(logsDir, logFileName) + file, err := os.OpenFile(filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) if err != nil { - return nil, err + return nil, fmt.Errorf("failed to create log file: %w", err) } + // Create multi-writer for console and file output + multiWriter := io.MultiWriter(os.Stdout, file) + return &AggregatedLogger{ - infoLogger: log.New(file, "", 0), - warnLogger: log.New(file, "", 0), - errorLogger: log.New(file, "", 0), - passLogger: log.New(file, "", 0), - failLogger: log.New(file, "", 0), + loggers: map[LogLevel]*log.Logger{ + LevelInfo: log.New(multiWriter, string(LevelInfo)+" ", log.Lmsgprefix), + LevelWarn: log.New(multiWriter, string(LevelWarn)+" ", log.Lmsgprefix), + LevelError: log.New(multiWriter, string(LevelError)+" ", log.Lmsgprefix), + LevelPass: log.New(multiWriter, string(LevelPass)+" ", log.Lmsgprefix), + LevelFail: log.New(multiWriter, string(LevelFail)+" ", log.Lmsgprefix), + LevelDebug: log.New(multiWriter, string(LevelDebug)+" ", log.Lmsgprefix), + }, + file: file, }, nil } -// getLogArgs is a helper function to generate common log arguments. -func getLogArgs(t *testing.T, message string) []interface{} { - return []interface{}{ - time.Now().Format("2006-01-02 15:04:05"), - t.Name(), - message, +// Close releases resources used by the logger +func (l *AggregatedLogger) Close() error { + if l.file != nil { + return l.file.Close() } + return nil } -// Info logs informational messages. +// logInternal is the internal logging function +func (l *AggregatedLogger) logInternal(t *testing.T, level LogLevel, message string) { + if logger, exists := l.loggers[level]; exists { + logger.Printf("[%s] [%s] %s", + time.Now().Format("2006-01-02 15:04:05"), + t.Name(), + message, + ) + } +} + +// Info logs informational messages func (l *AggregatedLogger) Info(t *testing.T, message string) { - format := "[%s] [INFO] [%s] : %v\n" - l.infoLogger.Printf(format, getLogArgs(t, message)...) + l.logInternal(t, LevelInfo, message) } -// Warn logs warning messages. +// Warn logs warning messages func (l *AggregatedLogger) Warn(t *testing.T, message string) { - format := "[%s] [WARN] [%s] : %v\n" - l.warnLogger.Printf(format, getLogArgs(t, message)...) + l.logInternal(t, LevelWarn, message) } -// Error logs error messages. +// Error logs error messages func (l *AggregatedLogger) Error(t *testing.T, message string) { - format := "[%s] [ERROR] [%s] : %v\n" - l.errorLogger.Printf(format, getLogArgs(t, message)...) - logger.Log(t, getLogArgs(t, message)...) + l.logInternal(t, LevelError, message) } -// Error logs error messages. +// PASS logs successful test messages func (l *AggregatedLogger) PASS(t *testing.T, message string) { - format := "[%s] [PASS] [%s] : %v\n" - l.passLogger.Printf(format, getLogArgs(t, message)...) - logger.Log(t, getLogArgs(t, message)...) + l.logInternal(t, LevelPass, message) } -// Error logs error messages. 
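A usage sketch of the reworked AggregatedLogger from inside a test; the log file name is arbitrary, and Close releases the underlying file handle:

func TestLoggerSketch(t *testing.T) {
	logger, err := NewAggregatedLogger("logger_sketch.log") // written under ../logs_output and echoed to stdout
	if err != nil {
		t.Fatalf("failed to create logger: %v", err)
	}
	defer func() {
		if cerr := logger.Close(); cerr != nil {
			t.Logf("failed to close logger: %v", cerr)
		}
	}()

	logger.Info(t, "starting validation")
	logger.DEBUG(t, "intermediate detail")
	logger.LogValidationResult(t, true, "cluster reachable over SSH")
}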
+// FAIL logs failed test messages func (l *AggregatedLogger) FAIL(t *testing.T, message string) { - format := "[%s] [FAIL] [%s] : %v\n" - l.failLogger.Printf(format, getLogArgs(t, message)...) - logger.Log(t, getLogArgs(t, message)...) + l.logInternal(t, LevelFail, message) +} + +// DEBUG logs debugging messages +func (l *AggregatedLogger) DEBUG(t *testing.T, message string) { + l.logInternal(t, LevelDebug, message) +} + +// LogValidationResult provides a consistent way to log validation results +func (l *AggregatedLogger) LogValidationResult(t *testing.T, success bool, message string) { + if success { + l.PASS(t, fmt.Sprintf("Validation succeeded : %s", message)) + } else { + l.FAIL(t, fmt.Sprintf("Validation failed : %s", message)) + } } diff --git a/tests/utilities/report.go b/tests/utilities/report.go index 20b9072b..cbb0fd2f 100644 --- a/tests/utilities/report.go +++ b/tests/utilities/report.go @@ -11,218 +11,324 @@ import ( "time" ) -// TestResult holds the result of a single test +// TestResult holds the result of a single test case type TestResult struct { - Test string // Test case name - Action string // PASS or FAIL - Elapsed float64 // Time elapsed in seconds + Test string `json:"test"` // Name of the test case + Action string `json:"action"` // Test outcome (PASS/FAIL) + Elapsed float64 `json:"elapsed"` // Duration in minutes (updated from seconds) } -// ReportData holds the data for the HTML report +// ReportData contains all data needed to generate the HTML report type ReportData struct { - Tests []TestResult // List of test results - TotalTests int // Total number of tests - TotalPass int // Total number of passing tests - TotalFail int // Total number of failing tests - TotalTime float64 // Total time taken for all tests - ChartData string // JSON data for charts - DateTime string // Date and time of report generation + Tests []TestResult `json:"tests"` // Individual test results + TotalTests int `json:"totalTests"` // Total number of tests + TotalPass int `json:"totalPass"` // Number of passed tests + TotalFail int `json:"totalFail"` // Number of failed tests + TotalTime float64 `json:"totalTime"` // Total execution time (now in minutes) + ChartData string `json:"chartData"` // JSON data for charts + DateTime string `json:"dateTime"` // Report generation timestamp } -// ParseLogFile parses the log file and generates a list of TestResult +// ParseJSONFile reads and parses a JSON test log file into TestResult structures func ParseJSONFile(fileName string) ([]TestResult, error) { file, err := os.Open(fileName) if err != nil { return nil, fmt.Errorf("error opening log file: %w", err) } + defer closeFile(file, fileName) - var returnErr error - defer func() { - if cerr := file.Close(); cerr != nil && returnErr == nil { - returnErr = fmt.Errorf("failed to close file: %w", cerr) - } - }() - - // Regular expression to capture results + // Regex to match test result lines (e.g., "--- PASS: TestSomething (0.45s)") reTestResult := regexp.MustCompile(`--- (PASS|FAIL): (\S+) \((\d+\.\d+)s\)`) - var results []TestResult - var testName string scanner := bufio.NewScanner(file) for scanner.Scan() { - line := scanner.Text() - - if matches := reTestResult.FindStringSubmatch(line); matches != nil { - // Ensure that the testName is set - if testName == "" { - testName = matches[2] // Use the test name from the result line as a fallback - } - elapsed := parseElapsed(matches[3]) + if matches := reTestResult.FindStringSubmatch(scanner.Text()); matches != nil { results = append(results, 
TestResult{ - Test: testName, + Test: matches[2], Action: matches[1], - Elapsed: elapsed, + Elapsed: parseElapsedTime(matches[3]), // Now stores minutes }) - // Reset the testName after it has been used - testName = "" } } if err := scanner.Err(); err != nil { - return nil, fmt.Errorf("error reading file: %w", err) + return nil, fmt.Errorf("error reading log file: %w", err) } return results, nil } -// Convert elapsed time from string to float64 -func parseElapsed(elapsedStr string) float64 { - var elapsed float64 - _, err := fmt.Sscanf(elapsedStr, "%f", &elapsed) +// GenerateHTMLReport creates an HTML report from test results +func GenerateHTMLReport(results []TestResult) error { + if len(results) == 0 { + return fmt.Errorf("no test results to report") + } + + // Calculate report statistics + stats := calculateStats(results) + + // Prepare chart data + chartData := map[string]interface{}{ + "labels": []string{"PASS", "FAIL"}, + "data": []int{stats.totalPass, stats.totalFail}, + } + chartDataJSON, err := json.Marshal(chartData) if err != nil { - fmt.Printf("Error parsing elapsed time : %s", err) - return 0.0 + return fmt.Errorf("error marshaling chart data: %w", err) + } + + // Prepare report data + reportData := ReportData{ + Tests: results, + TotalTests: stats.totalTests, + TotalPass: stats.totalPass, + TotalFail: stats.totalFail, + TotalTime: stats.totalTime, + ChartData: string(chartDataJSON), + DateTime: time.Now().Format("2006-01-02 15:04:05"), } - return elapsed + + // Generate and write the report + return writeReport(reportData) } -// GenerateHTMLReport generates an HTML report from the test results -func GenerateHTMLReport(results []TestResult) error { - totalTests := len(results) - totalPass := 0 - totalFail := 0 - totalTime := 0.0 +// reportStats holds calculated statistics for the report +type reportStats struct { + totalTests int + totalPass int + totalFail int + totalTime float64 // Now in minutes +} + +// calculateStats computes summary statistics from test results +func calculateStats(results []TestResult) reportStats { + var stats reportStats + stats.totalTests = len(results) + for _, result := range results { switch result.Action { case "PASS": - totalPass++ + stats.totalPass++ case "FAIL": - totalFail++ + stats.totalFail++ } - totalTime += result.Elapsed + stats.totalTime += result.Elapsed // Already in minutes } - // Prepare chart data - chartData := map[string]interface{}{ - "labels": []string{"PASS", "FAIL"}, - "data": []int{totalPass, totalFail}, + return stats +} + +// writeReport generates and writes the HTML report file +func writeReport(data ReportData) error { + tmpl, err := template.New("report").Parse(reportTemplate) + if err != nil { + return fmt.Errorf("template parsing failed: %w", err) } - chartDataJSON, err := json.Marshal(chartData) + + reportFileName := getReportFileName() + reportFile, err := os.Create(reportFileName) if err != nil { - return fmt.Errorf("error marshaling chart data: %w", err) + return fmt.Errorf("error creating report file: %w", err) } + defer closeFile(reportFile, reportFileName) - currentTime := time.Now().Format("2006-01-02 15:04:05") + // Execute template with cleaned content + cleanedContent := cleanTemplateOutput(tmpl, data) + if _, err := reportFile.WriteString(cleanedContent); err != nil { + return fmt.Errorf("error writing report: %w", err) + } - reportData := ReportData{ - Tests: results, - TotalTests: totalTests, - TotalPass: totalPass, - TotalFail: totalFail, - TotalTime: totalTime, - ChartData: string(chartDataJSON), - 
DateTime: currentTime, + fmt.Printf("✅ HTML report generated: %s\n", reportFileName) + return nil +} + +// getReportFileName determines the output filename for the report +func getReportFileName() string { + if logFile, ok := os.LookupEnv("LOG_FILE_NAME"); ok { + return strings.TrimSuffix(logFile, ".json") + ".html" } + return "test-report-" + time.Now().Format("20060102-150405") + ".html" +} - htmlTemplate := ` - +// parseElapsedTime converts elapsed time string to float64 (now in minutes) +func parseElapsedTime(elapsedStr string) float64 { + var seconds float64 + if _, err := fmt.Sscanf(elapsedStr, "%f", &seconds); err != nil { + fmt.Fprintf(os.Stderr, "warning: failed to parse elapsed time '%s': %v\n", elapsedStr, err) + return 0 + } + return seconds / 60 // Convert seconds → minutes +} + +// cleanTemplateOutput processes template output for better HTML formatting +func cleanTemplateOutput(tmpl *template.Template, data interface{}) string { + var sb strings.Builder + if err := tmpl.Execute(&sb, data); err != nil { + fmt.Fprintf(os.Stderr, "warning: template execution error: %v\n", err) + return "" + } + + // Clean up whitespace and newlines + reNewline := regexp.MustCompile(`[\r\n]+`) + noNewlines := reNewline.ReplaceAllString(sb.String(), " ") + reWhitespace := regexp.MustCompile(`\s+`) + cleaned := reWhitespace.ReplaceAllString(noNewlines, " ") + return strings.TrimSpace(cleaned) + "\n" +} + +// closeFile safely closes a file and logs any errors +func closeFile(file *os.File, fileName string) { + if err := file.Close(); err != nil { + fmt.Fprintf(os.Stderr, "warning: failed to close file %s: %v\n", fileName, err) + } +} + +// reportTemplate is the HTML template for the test report (updated to show "mins") +const reportTemplate = ` HPC Test Summary Report + +

[HTML markup of the reportTemplate constant is not recoverable from this copy; only the template text survives. Recoverable content of this hunk: the page is titled "HPC Test Summary Report". The old template printed "Date and Time: {{.DateTime}}", "Total Tests: {{.TotalTests}}", "Total Pass: {{.TotalPass}}", "Total Fail: {{.TotalFail}}", and "Total Time Taken: {{printf "%.2f" .TotalTime}} seconds". The new template renders a "Report Summary" header with "Generated on: {{.DateTime}}", summary cards for Total Tests, Passed, Failed, and "Total Time: {{printf "%.2f" .TotalTime}} mins", a pass/fail chart fed by {{.ChartData}}, and a "Detailed Test Results" table whose columns are Test Name, Status, and Duration (mins) ({{printf "%.3f" .Elapsed}}), replacing the previous "Elapsed Time (s)" column.]
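A standalone sketch of the result-line parsing and seconds-to-minutes conversion that ParseJSONFile and parseElapsedTime perform; the sample go test output line is illustrative:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

func main() {
	// Same pattern used in ParseJSONFile to match "--- PASS: TestX (12.50s)" lines.
	re := regexp.MustCompile(`--- (PASS|FAIL): (\S+) \((\d+\.\d+)s\)`)
	line := "--- PASS: TestRunDefault (754.20s)" // illustrative test log line

	m := re.FindStringSubmatch(line)
	if m == nil {
		fmt.Println("no test result on this line")
		return
	}
	seconds, err := strconv.ParseFloat(m[3], 64)
	if err != nil {
		fmt.Println("bad elapsed value:", err)
		return
	}
	fmt.Printf("test=%s action=%s duration=%.3f mins\n", m[2], m[1], seconds/60)
}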
+ + - - - ` - - // Parse and execute the HTML template - tmpl, err := template.New("report").Parse(htmlTemplate) - if err != nil { - return fmt.Errorf("error creating template: %w", err) - } - - reportFileName, ok := os.LookupEnv("LOG_FILE_NAME") - if ok { - getFileName := strings.Split(reportFileName, ".")[0] - // Create or overwrite the report file - reportFile, err := os.Create(getFileName + ".html") - if err != nil { - return fmt.Errorf("error creating report file: %w", err) - } - - var returnErr error - defer func() { - if cerr := reportFile.Close(); cerr != nil && returnErr == nil { - returnErr = fmt.Errorf("failed to close report file: %w", cerr) - } - }() - - // Execute the template with the data - err = tmpl.Execute(reportFile, reportData) - if err != nil { - return fmt.Errorf("error generating report: %w", err) - } - fmt.Printf("HTML report generated: %s.html\n", getFileName) - } - return nil -} +` diff --git a/tests/utilities/resources.go b/tests/utilities/resources.go index 6f7f5291..e23805f0 100644 --- a/tests/utilities/resources.go +++ b/tests/utilities/resources.go @@ -3,7 +3,9 @@ package tests import ( "bytes" "fmt" + "os" "os/exec" + "path/filepath" "strings" "testing" @@ -41,54 +43,120 @@ func IsVPCExist(vpcName string) (bool, error) { return bytes.Contains(output, []byte(vpcName)), nil } +// // GetBastionServerIP retrieves the IP address from the BastionServer section in the specified INI file. +// func GetBastionServerIPFromIni(t *testing.T, filePath string, logger *AggregatedLogger) (string, error) { +// value, err := GetValueFromIniFile(filePath+"/bastion.ini", "BastionServer") +// if err != nil { +// return "", fmt.Errorf("failed to get value from bastion.ini: %w", err) +// } +// logger.Info(t, fmt.Sprintf("Bastion Server IP: %s", value[1])) +// return value[1], nil +// } + +// // GetManagementNodeIPs retrieves the IP addresses from the HPCAASCluster section in the specified INI file. +// func GetManagementNodeIPsFromIni(t *testing.T, filePath string, logger *AggregatedLogger) ([]string, error) { +// value, err := GetValueFromIniFile(filePath+"/compute.ini", "HPCAASCluster") +// if err != nil { +// return nil, fmt.Errorf("failed to get value from compute.ini: %w", err) +// } +// logger.Info(t, fmt.Sprintf("Management Node IPs List: %q", value[1:])) +// return value[1:], nil +// } + +// // GetLoginNodeIP retrieves the IP address from the LoginServer section in the specified login INI file. +// func GetLoginNodeIPFromIni(t *testing.T, filePath string, logger *AggregatedLogger) (string, error) { +// value, err := GetValueFromIniFile(filePath+"/login.ini", "LoginServer") +// if err != nil { +// return "", fmt.Errorf("failed to get value from login.ini: %w", err) +// } +// logger.Info(t, fmt.Sprintf("Login Server IP: %s", value[1])) +// return value[1], nil +// } + +// // GetLdapServerIP retrieves the IP address from the LdapServer section in the specified login INI file. +// func GetLdapServerIPFromIni(t *testing.T, filePath string, logger *AggregatedLogger) (string, error) { +// value, err := GetValueFromIniFile(filePath+"/ldap.ini", "LDAPServer") +// if err != nil { +// return "", fmt.Errorf("failed to get value from ldap.ini: %w", err) +// } +// logger.Info(t, fmt.Sprintf("Ldap Server IP: %s", value[1])) +// return value[1], nil +// } + +// // GetWorkerNodeIPsFromIni retrieves the IP address from the WorkerServer section in the specified login INI file. 
+// func GetWorkerNodeIPsFromIni(t *testing.T, filePath string, logger *AggregatedLogger) ([]string, error) { +// value, err := GetValueFromIniFile(filePath+"/worker.ini", "WorkerServer") +// if err != nil { +// return nil, fmt.Errorf("failed to get value from worker.ini: %w", err) +// } +// logger.Info(t, fmt.Sprintf("Worker Node IPs List %q", value[1:])) +// return value[1:], nil +// } + // GetBastionServerIP retrieves the IP address from the BastionServer section in the specified INI file. func GetBastionServerIPFromIni(t *testing.T, filePath string, logger *AggregatedLogger) (string, error) { - value, err := GetValueFromIniFile(filePath+"/bastion.ini", "BastionServer") + + value, err := GetValueFromIniFile(filepath.Join(filePath, "bastion_hosts.ini")) if err != nil { - return "", fmt.Errorf("failed to get value from bastion.ini: %w", err) + return "", fmt.Errorf("failed to get value from bastion_hosts.ini : %w", err) } - logger.Info(t, fmt.Sprintf("Bastion Server IP: %s", value[1])) - return value[1], nil + logger.Info(t, fmt.Sprintf("Bastion Server IP: %s", value[0])) + return value[0], nil } // GetManagementNodeIPs retrieves the IP addresses from the HPCAASCluster section in the specified INI file. func GetManagementNodeIPsFromIni(t *testing.T, filePath string, logger *AggregatedLogger) ([]string, error) { - value, err := GetValueFromIniFile(filePath+"/compute.ini", "HPCAASCluster") + + value, err := GetValueFromIniFile(filepath.Join(filePath, "mgmt_hosts.ini")) if err != nil { - return nil, fmt.Errorf("failed to get value from compute.ini: %w", err) + return nil, fmt.Errorf("failed to get value from mgmt_hosts.ini : %w", err) } - logger.Info(t, fmt.Sprintf("Management Node IPs List: %q", value[1:])) - return value[1:], nil + logger.Info(t, fmt.Sprintf("Management Node IPs List: %q", value)) + return value, nil } -// GetLoginNodeIP retrieves the IP address from the LoginServer section in the specified login INI file. +// GetLoginNodeIP retrieves the IP address from the LoginServer section in the specified INI file. func GetLoginNodeIPFromIni(t *testing.T, filePath string, logger *AggregatedLogger) (string, error) { - value, err := GetValueFromIniFile(filePath+"/login.ini", "LoginServer") + + value, err := GetValueFromIniFile(filepath.Join(filePath, "login_host.ini")) + if err != nil { + return "", fmt.Errorf("failed to get value from login_host.ini : %w", err) + } + logger.Info(t, fmt.Sprintf("Login Server IP: %s", value[0])) + return value[0], nil +} + +// GetWorkerNodeIPsFromIni retrieves the IP address from the WorkerServer section in the specified INI file. +func GetWorkerNodeIPsFromIni(t *testing.T, filePath string, logger *AggregatedLogger) ([]string, error) { + + value, err := GetValueFromIniFile(filepath.Join(filePath, "compute_hosts.ini")) if err != nil { - return "", fmt.Errorf("failed to get value from login.ini: %w", err) + return nil, fmt.Errorf("failed to get value from compute_hosts.ini : %w", err) } - logger.Info(t, fmt.Sprintf("Login Server IP: %s", value[1])) - return value[1], nil + logger.Info(t, fmt.Sprintf("Worker Node IPs List %q", value)) + return value, nil } -// GetLdapServerIP retrieves the IP address from the LdapServer section in the specified login INI file. +// GetLdapServerIP retrieves the IP address from the LdapServer section in the specified INI file. 
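The renamed *_hosts.ini files are plain one-address-per-line lists read through GetValueFromIniFile. A hedged usage sketch of the management-node getter, assuming the same tests package with fmt imported; the directory path is hypothetical:

func printManagementIPs(t *testing.T, logger *AggregatedLogger) error {
	// <terraform dir>/mgmt_hosts.ini is expected to contain one IP per line.
	ips, err := GetManagementNodeIPsFromIni(t, "/tmp/lsf-terraform", logger)
	if err != nil {
		return fmt.Errorf("failed to read management node IPs: %w", err)
	}
	for _, ip := range ips {
		fmt.Println("management node:", ip)
	}
	return nil
}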
func GetLdapServerIPFromIni(t *testing.T, filePath string, logger *AggregatedLogger) (string, error) { - value, err := GetValueFromIniFile(filePath+"/ldap.ini", "LDAPServer") + + value, err := GetValueFromIniFile(filepath.Join(filePath, "ldap_hosts.ini")) if err != nil { return "", fmt.Errorf("failed to get value from ldap.ini: %w", err) } - logger.Info(t, fmt.Sprintf("Ldap Server IP: %s", value[1])) - return value[1], nil + logger.Info(t, fmt.Sprintf("Ldap Server IP: %s", value[0])) + return value[0], nil } -// GetWorkerNodeIPsFromIni retrieves the IP address from the WorkerServer section in the specified login INI file. -func GetWorkerNodeIPsFromIni(t *testing.T, filePath string, logger *AggregatedLogger) ([]string, error) { - value, err := GetValueFromIniFile(filePath+"/worker.ini", "WorkerServer") +// GetDeployerNodeIPFromIni retrieves the IP address from the deployer section in the specified INI file. +func GetDeployerNodeIPFromIni(t *testing.T, filePath string, logger *AggregatedLogger) (string, error) { + + value, err := GetValueFromIniFile(filepath.Join(filePath, "deployer_hosts.ini")) if err != nil { - return nil, fmt.Errorf("failed to get value from worker.ini: %w", err) + return "", fmt.Errorf("failed to get value from deployer_hosts.ini: %w", err) } - logger.Info(t, fmt.Sprintf("Worker Node IPs List %q", value[1:])) - return value[1:], nil + logger.Info(t, fmt.Sprintf("Deployer Server IP: %s", value[0])) + return value[0], nil } // HPCGetClusterIPs retrieves the IP addresses of the bastion server, management nodes, and login node @@ -119,10 +187,29 @@ func HPCGetClusterIPs(t *testing.T, options *testhelper.TestOptions, logger *Agg return bastionIP, managementNodeIPList, loginNodeIP, nil } +// List files in directory +func ListFiles(dir string) { + fmt.Println("Listing files in:", dir) + err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + if err != nil { + fmt.Printf("Walk error on %s: %v\n", path, err) + return nil + } + if !info.IsDir() { + fmt.Printf("Found file: %s (%d bytes)\n", path, info.Size()) + } + return nil + }) + if err != nil { + fmt.Println("Walk error:", err) + } +} + // LSFGetClusterIPs retrieves the IP addresses of the bastion server, management nodes, and login node // from the specified file path in the provided test options, using the provided logger for logging. // It returns the bastion server IP, a list of management node IPs, the login node IP, and any error encountered. func LSFGetClusterIPs(t *testing.T, options *testhelper.TestOptions, logger *AggregatedLogger) (bastionIP string, managementNodeIPList []string, loginNodeIP string, workerNodeIPList []string, err error) { + // Retrieve the Terraform directory from the options. filePath := options.TerraformOptions.TerraformDir @@ -153,41 +240,6 @@ func LSFGetClusterIPs(t *testing.T, options *testhelper.TestOptions, logger *Agg return bastionIP, managementNodeIPList, loginNodeIP, workerNodeIPList, nil } -// HPCGetClusterIPsWithLDAP retrieves the IP addresses of various servers, including the LDAP server. -// from the specified file path in the provided test options, using the provided logger for logging. -// It returns the bastion server IP, a list of management node IPs, the login node IP, ldap server IP and any error encountered. 
-func HPCGetClusterIPsWithLDAP(t *testing.T, options *testhelper.TestOptions, logger *AggregatedLogger) (bastionIP string, managementNodeIPList []string, loginNodeIP, ldapIP string, err error) { - // Retrieve the Terraform directory from the options. - filePath := options.TerraformOptions.TerraformDir - - // Get the bastion server IP and handle errors. - bastionIP, err = GetBastionServerIPFromIni(t, filePath, logger) - if err != nil { - return "", nil, "", "", fmt.Errorf("error getting bastion server IP: %v", err) - } - - // Get the management node IPs and handle errors. - managementNodeIPList, err = GetManagementNodeIPsFromIni(t, filePath, logger) - if err != nil { - return "", nil, "", "", fmt.Errorf("error getting management node IPs: %v", err) - } - - // Get the login node IP and handle errors. - loginNodeIP, err = GetLoginNodeIPFromIni(t, filePath, logger) - if err != nil { - return "", nil, "", "", fmt.Errorf("error getting login node IP: %v", err) - } - - // Get the LDAP server IP and handle errors. - ldapIP, err = GetLdapServerIPFromIni(t, filePath, logger) - if err != nil { - return "", nil, "", "", fmt.Errorf("error getting LDAP server IP: %v", err) - } - - // Return the retrieved IP addresses and any error. - return bastionIP, managementNodeIPList, loginNodeIP, ldapIP, nil -} - // LSFGetClusterIPsWithLDAP retrieves the IP addresses of various servers, including the LDAP server, // from the specified file path in the provided test options, using the provided logger for logging. // It returns the bastion server IP, a list of management node IPs, the login node IP, worker node IPs, @@ -215,10 +267,10 @@ func LSFGetClusterIPsWithLDAP(t *testing.T, options *testhelper.TestOptions, log return "", nil, "", nil, "", fmt.Errorf("failed to get management node IPs: %v", err) } - // Get the login node IP and handle errors. + // Get login node IP and handle errors loginNodeIP, err = GetLoginNodeIPFromIni(t, filePath, logger) if err != nil { - return "", nil, "", nil, "", fmt.Errorf("failed to get login node IP: %v", err) + return "", nil, "", nil, "", fmt.Errorf("failed to get login node IPs: %v", err) } // Get worker node IPs and handle errors. @@ -416,7 +468,8 @@ func RetrieveAndUpdateSecurityGroup(t *testing.T, apiKey, region, resourceGroup, } // Command to get the security group ID based on the cluster prefix. - getSecurityGroupIDCmd := fmt.Sprintf("ibmcloud is security-groups | grep %s-cluster-sg | awk '{print $1}'", clusterPrefix) + //getSecurityGroupIDCmd := fmt.Sprintf("ibmcloud is security-groups | grep %s-cluster-sg | awk '{print $1}'", clusterPrefix) + getSecurityGroupIDCmd := fmt.Sprintf("ibmcloud is security-groups | grep %s-comp-sg | awk '{print $1}'", clusterPrefix) securityGroupIDBytes, err := exec.Command("bash", "-c", getSecurityGroupIDCmd).CombinedOutput() if err != nil { return fmt.Errorf("failed to retrieve security group ID: %w", err) @@ -446,3 +499,17 @@ func RetrieveAndUpdateSecurityGroup(t *testing.T, apiKey, region, resourceGroup, return nil } + +// LSFGetDeployerIP retrieves the deployer node IP address +// from the specified Terraform directory in the test options. +// It logs operations using the provided logger and returns the IP address or an error. 
+func LSFGetDeployerIP(t *testing.T, options *testhelper.TestOptions, logger *AggregatedLogger) (string, error) { + terraformDir := options.TerraformOptions.TerraformDir + + deployerIP, err := GetDeployerNodeIPFromIni(t, terraformDir, logger) + if err != nil { + return "", fmt.Errorf("error retrieving deployer IP from INI: %w", err) + } + + return deployerIP, nil +} diff --git a/tools/image-builder/main.tf b/tools/image-builder/main.tf index b0c5f677..4b66de01 100644 --- a/tools/image-builder/main.tf +++ b/tools/image-builder/main.tf @@ -1,6 +1,6 @@ module "landing_zone" { source = "terraform-ibm-modules/landing-zone/ibm" - version = "7.4.3" + version = "7.4.4" prefix = local.prefix region = local.region tags = local.tags @@ -48,15 +48,11 @@ resource "ibm_is_subnet_public_gateway_attachment" "zone_1_attachment" { resource "null_resource" "compress_and_encode_folder" { provisioner "local-exec" { command = < ./packer/hpcaas/encoded_compute.txt - fi + # Encode the compressed file to base64 + base64 -i ${path.module}/packer/hpcaas/compressed_compute.tar.gz -o ${path.module}/packer/hpcaas/encoded_compute.txt EOT } } diff --git a/tools/image-builder/template_files.tf b/tools/image-builder/template_files.tf index c02639f4..c112055c 100644 --- a/tools/image-builder/template_files.tf +++ b/tools/image-builder/template_files.tf @@ -19,6 +19,5 @@ data "template_file" "packer_user_data" { zones = join(",", var.zones) existing_resource_group = var.existing_resource_group private_catalog_id = var.private_catalog_id - solution = var.solution } } diff --git a/tools/image-builder/templates/packer_user_data.tpl b/tools/image-builder/templates/packer_user_data.tpl index c2efae84..cd0e8bd0 100644 --- a/tools/image-builder/templates/packer_user_data.tpl +++ b/tools/image-builder/templates/packer_user_data.tpl @@ -66,7 +66,6 @@ mkdir /HPCaaS cd /HPCaaS git clone https://github.com/terraform-ibm-modules/terraform-ibm-hpc.git cd /HPCaaS/terraform-ibm-hpc/solutions/hpc -echo "======================Cloning HPC public repo completed=====================" echo "======================Installing terraform=====================" git clone --depth=1 https://github.com/tfutils/tfenv.git ~/.tfenv @@ -87,14 +86,14 @@ echo "====================== Cos Bucket mounting completed ===================== cd /var/packer/hpcaas/compute sudo -E packer init . && sudo -E packer build \ - -var "ibm_api_key=${ibm_api_key}" \ - -var "vpc_region=${vpc_region}" \ - -var "resource_group_id=${resource_group_id}" \ - -var "vpc_subnet_id=${vpc_subnet_id}" \ - -var "source_image_name=${source_image_name}" \ - -var "install_sysdig=${install_sysdig}" \ - -var "security_group_id=${security_group_id}" \ - -var "image_name=${image_name}" . + -var "ibm_api_key=${ibm_api_key}" \ + -var "vpc_region=${vpc_region}" \ + -var "resource_group_id=${resource_group_id}" \ + -var "vpc_subnet_id=${vpc_subnet_id}" \ + -var "source_image_name=${source_image_name}" \ + -var "install_sysdig=${install_sysdig}" \ + -var "security_group_id=${security_group_id}" \ + -var "image_name=${image_name}" . 
echo "========== Generating SSH key =========" mkdir -p /HPCaaS/artifacts/.ssh @@ -130,19 +129,12 @@ fi echo "========== Executing Go function to validate the image through HPC deployment =========" export TF_VAR_ibmcloud_api_key=${ibm_api_key} -if [ "${solution}" != "lsf" ]; then - if [ "${private_catalog_id}" ]; then - SOLUTION=${solution} PREFIX=${prefix} CLUSTER_NAME=${cluster_name} RESERVATION_ID=${reservation_id} SSH_FILE_PATH="/HPCaaS/artifacts/.ssh/id_rsa" REMOTE_ALLOWED_IPS=$PACKER_FIP SSH_KEYS=$CICD_SSH_KEY CATALOG_VALIDATE_SSH_KEY=${catalog_validate_ssh_key} ZONES=${zones} EXISTING_RESOURCE_GROUP=${existing_resource_group} COMPUTE_IMAGE_NAME=${image_name} PRIVATE_CATALOG_ID=${private_catalog_id} VPC_ID=${vpc_id} SUBNET_ID=${vpc_subnet_id} SOURCE_IMAGE_NAME=${source_image_name} go test -v -timeout 900m -parallel 4 -run "TestRunHpcDeploymentForCustomImageBuilder" | tee hpc_log_$(date +%d-%m-%Y-%H-%M-%S).log - else - SOLUTION=${solution} PREFIX=${prefix} CLUSTER_NAME=${cluster_name} RESERVATION_ID=${reservation_id} SSH_FILE_PATH="/HPCaaS/artifacts/.ssh/id_rsa" REMOTE_ALLOWED_IPS=$PACKER_FIP SSH_KEYS=$CICD_SSH_KEY ZONES=${zones} EXISTING_RESOURCE_GROUP=${existing_resource_group} COMPUTE_IMAGE_NAME=${image_name} SOURCE_IMAGE_NAME=${source_image_name} go test -v -timeout 900m -parallel 4 -run "TestRunHpcDeploymentForCustomImageBuilder" | tee hpc_log_$(date +%d-%m-%Y-%H-%M-%S).log - fi +if [ "${private_catalog_id}" ]; then + PREFIX=${prefix} CLUSTER_NAME=${cluster_name} RESERVATION_ID=${reservation_id} SSH_FILE_PATH="/HPCaaS/artifacts/.ssh/id_rsa" REMOTE_ALLOWED_IPS=$PACKER_FIP SSH_KEYS=$CICD_SSH_KEY CATALOG_VALIDATE_SSH_KEY=${catalog_validate_ssh_key} ZONES=${zones} EXISTING_RESOURCE_GROUP=${existing_resource_group} COMPUTE_IMAGE_NAME=${image_name} PRIVATE_CATALOG_ID=${private_catalog_id} VPC_ID=${vpc_id} SUBNET_ID=${vpc_subnet_id} SOURCE_IMAGE_NAME=${source_image_name} go test -v -timeout 900m -parallel 4 -run "TestRunHpcDeploymentForCustomImageBuilder" | tee hpc_log_$(date +%d-%m-%Y-%H-%M-%S).log else - if [ "${private_catalog_id}" ]; then - SOLUTION=${solution} PREFIX=${prefix} CLUSTER_NAME=${cluster_name} SSH_FILE_PATH="/HPCaaS/artifacts/.ssh/id_rsa" REMOTE_ALLOWED_IPS=$PACKER_FIP SSH_KEYS=$CICD_SSH_KEY CATALOG_VALIDATE_SSH_KEY=${catalog_validate_ssh_key} ZONES=${zones} EXISTING_RESOURCE_GROUP=${existing_resource_group} COMPUTE_IMAGE_NAME=${image_name} PRIVATE_CATALOG_ID=${private_catalog_id} VPC_ID=${vpc_id} SUBNET_ID=${vpc_subnet_id} SOURCE_IMAGE_NAME=${source_image_name} go test -v -timeout 900m -parallel 4 -run "TestRunHpcDeploymentForCustomImageBuilder" | tee hpc_log_$(date +%d-%m-%Y-%H-%M-%S).log - else - SOLUTION=${solution} PREFIX=${prefix} CLUSTER_NAME=${cluster_name} SSH_FILE_PATH="/HPCaaS/artifacts/.ssh/id_rsa" REMOTE_ALLOWED_IPS=$PACKER_FIP SSH_KEYS=$CICD_SSH_KEY ZONES=${zones} EXISTING_RESOURCE_GROUP=${existing_resource_group} COMPUTE_IMAGE_NAME=${image_name} SOURCE_IMAGE_NAME=${source_image_name} go test -v -timeout 900m -parallel 4 -run "TestRunHpcDeploymentForCustomImageBuilder" | tee hpc_log_$(date +%d-%m-%Y-%H-%M-%S).log - fi + PREFIX=${prefix} CLUSTER_NAME=${cluster_name} RESERVATION_ID=${reservation_id} SSH_FILE_PATH="/HPCaaS/artifacts/.ssh/id_rsa" REMOTE_ALLOWED_IPS=$PACKER_FIP SSH_KEYS=$CICD_SSH_KEY ZONES=${zones} EXISTING_RESOURCE_GROUP=${existing_resource_group} COMPUTE_IMAGE_NAME=${image_name} SOURCE_IMAGE_NAME=${source_image_name} go test -v -timeout 900m -parallel 4 -run "TestRunHpcDeploymentForCustomImageBuilder" | tee hpc_log_$(date 
+%d-%m-%Y-%H-%M-%S).log fi + echo "========== Deleting the SSH key =========" ibmcloud is key-delete $CICD_SSH_KEY -f diff --git a/tools/image-builder/variables.tf b/tools/image-builder/variables.tf index 46063d04..e044ede0 100644 --- a/tools/image-builder/variables.tf +++ b/tools/image-builder/variables.tf @@ -21,7 +21,7 @@ variable "existing_resource_group" { type = string default = "Default" validation { - condition = var.existing_resource_group != null + condition = var.resource_group != null error_message = "If you want to provide null for resource_group variable, it should be within double quotes." } } @@ -201,8 +201,11 @@ variable "cluster_name" { variable "reservation_id" { type = string sensitive = true - default = "" description = "Ensure that you have received the reservation ID from IBM technical sales. Reservation ID is a unique identifier to distinguish different IBM Cloud HPC service agreements. It must start with a letter and can only contain letters, numbers, hyphens (-), or underscores (_)." + validation { + condition = can(regex("^[a-zA-Z][a-zA-Z0-9-_]*$", var.reservation_id)) + error_message = "Reservation ID must start with a letter and can only contain letters, numbers, hyphens (-), or underscores (_)." + } } # tflint-ignore: terraform_unused_declarations @@ -211,9 +214,3 @@ variable "private_catalog_id" { default = "" description = "Provide the private catalog ID if you wish to publish and share the created image to the CE account." } - -variable "solution" { - type = string - default = "lsf" - description = "Provide the value for the solution that is needed for the support of lsf and HPC" -} diff --git a/tools/image-builder/version.tf b/tools/image-builder/version.tf index 4d87918b..0fa51187 100644 --- a/tools/image-builder/version.tf +++ b/tools/image-builder/version.tf @@ -3,7 +3,7 @@ terraform { required_providers { ibm = { source = "IBM-Cloud/ibm" - version = "1.77.0" + version = "1.69.2" } null = { source = "hashicorp/null" diff --git a/variables.tf b/variables.tf new file mode 100644 index 00000000..61014fff --- /dev/null +++ b/variables.tf @@ -0,0 +1,1140 @@ +############################################################################## +# Account Variables +############################################################################## +variable "ibmcloud_api_key" { + type = string + sensitive = true + default = null + description = "IBM Cloud API Key that will be used for authentication in scripts run in this module. Only required if certain options are required." +} + +variable "lsf_version" { + type = string + default = "fixpack_15" + description = "Select the desired version of IBM Spectrum LSF to deploy either fixpack_15 or fixpack_14. By default, the solution uses the latest available version, which is Fix Pack 15. If you need to deploy an earlier version such as Fix Pack 14, update the lsf_version field to fixpack_14. When changing the LSF version, ensure that all custom images used for management, compute, and login nodes correspond to the same version. This is essential to maintain compatibility across the cluster and to prevent deployment issues." 
+} + +############################################################################## +# Offering Variations +############################################################################## +variable "scheduler" { + type = string + default = null + description = "Select one of the scheduler (LSF/Symphony/Slurm/null)" +} + +variable "ibm_customer_number" { + type = string + sensitive = true + default = null + description = "Comma-separated list of the IBM Customer Number(s) (ICN) that is used for the Bring Your Own License (BYOL) entitlement check. For more information on how to find your ICN, see [What is my IBM Customer Number (ICN)?](https://www.ibm.com/support/pages/what-my-ibm-customer-number-icn)." +} + +############################################################################## +# Cluster Level Variables +############################################################################## +variable "cluster_prefix" { + type = string + default = "lsf" + description = "A unique identifier for resources. Must begin with a letter and end with a letter or number. This cluster_prefix will be prepended to any resources provisioned by this template. Prefixes must be 16 or fewer characters." + validation { + error_message = "cluster_prefix must begin and end with a letter and contain only letters, numbers, and - characters." + condition = can(regex("^([A-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.cluster_prefix)) + } + validation { + condition = length(var.cluster_prefix) <= 16 + error_message = "The cluster_prefix must be 16 characters or fewer." + } +} + +variable "zones" { + description = "Specify the IBM Cloud zone within the chosen region where the IBM Spectrum LSF cluster will be deployed. A single zone input is required, and the management nodes, file storage shares, and compute nodes will all be provisioned in this zone.[Learn more](https://cloud.ibm.com/docs/vpc?topic=vpc-creating-a-vpc-in-a-different-region#get-zones-using-the-cli)." + type = list(string) + default = ["us-east-1"] + validation { + condition = length(var.zones) == 1 + error_message = "HPC product deployment supports only a single zone. Provide a value for a single zone from the supported regions: eu-de-2 or eu-de-3 for eu-de, us-east-1 or us-east-3 for us-east, and us-south-1 for us-south." + } +} + +variable "ssh_keys" { + type = list(string) + default = null + description = "The key pair to use to access the HPC cluster." +} + +variable "remote_allowed_ips" { + type = list(string) + description = "Comma-separated list of IP addresses that can access the IBM Spectrum LSF cluster instance through an SSH interface. For security purposes, provide the public IP addresses assigned to the devices that are authorized to establish SSH connections (for example, [\"169.45.117.34\"]). To fetch the IP address of the device, use [https://ipv4.icanhazip.com/](https://ipv4.icanhazip.com/)." + validation { + condition = alltrue([ + for o in var.remote_allowed_ips : !contains(["0.0.0.0/0", "0.0.0.0"], o) + ]) + error_message = "For security, provide the public IP addresses assigned to the devices authorized to establish SSH connections. Use https://ipv4.icanhazip.com/ to fetch the ip address of the device." 
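# Editor's note: an illustrative terraform.tfvars fragment that satisfies the
# remote_allowed_ips checks above (public addresses or CIDRs, never 0.0.0.0/0).
# The addresses shown are placeholders, not real deployment values.
remote_allowed_ips = ["169.45.117.34", "128.122.144.0/24"]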
+ } + validation { + condition = alltrue([ + for a in var.remote_allowed_ips : can(regex("^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(/(3[0-2]|2[0-9]|1[0-9]|[0-9]))?$", a)) + ]) + error_message = "The provided IP address format is not valid. Check if the IP address contains a comma instead of a dot, and ensure there are double quotation marks between each IP address range if using multiple IP ranges. For multiple IP address, use the format [\"169.45.117.34\",\"128.122.144.145\"]." + } +} + +variable "existing_resource_group" { + type = string + default = "Default" + description = "String describing resource groups to create or reference" +} + +############################################################################## +# VPC Variables +############################################################################## +variable "vpc_name" { + type = string + default = null + description = "Name of an existing VPC in which the cluster resources will be deployed. If no value is given, then a new VPC will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" +} + +variable "vpc_cidr" { + type = string + default = "10.241.0.0/18" + description = "Network CIDR for the VPC. This is used to manage network ACL rules for cluster provisioning." +} + +variable "placement_strategy" { + type = string + default = null + description = "VPC placement groups to create (null / host_spread / power_spread)" +} + +############################################################################## +# Access Variables +############################################################################## +variable "bastion_instance" { + type = object({ + image = string + profile = string + }) + default = { + image = "ibm-ubuntu-22-04-3-minimal-amd64-1" + profile = "cx2-4x8" + } + description = "Configuration for the Bastion node, including the image and instance profile. Only Ubuntu stock images are supported." +} + +variable "login_subnet_id" { + type = string + default = null + description = "Name of an existing subnets in which the cluster resources will be deployed. If no value is given, then new subnet(s) will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)" +} + +variable "vpc_cluster_login_private_subnets_cidr_blocks" { + type = string + default = "10.241.16.0/28" + description = "Provide the CIDR block required for the creation of the login cluster's private subnet. Only one CIDR block is needed. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Since the login subnet is used only for the creation of login virtual server instances, provide a CIDR range of /28." + validation { + condition = tonumber(regex("^.*?/(\\d+)$", var.vpc_cluster_login_private_subnets_cidr_blocks)[0]) <= 28 + error_message = "This subnet is used to create only a login virtual server instance. Providing a larger CIDR size will waste the usage of available IPs. A CIDR range of /28 is sufficient for the creation of the login subnet." 
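# Editor's note: an illustrative sketch of how the /28 check above reads the prefix
# length out of the CIDR string; the sample CIDR is an assumption.
locals {
  login_cidr        = "10.241.16.0/28"
  login_cidr_prefix = tonumber(regex("^.*?/(\\d+)$", local.login_cidr)[0]) # 28
  login_cidr_ok     = local.login_cidr_prefix <= 28                        # true
}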
+  }
+}
+
+##############################################################################
+# Deployer Variables
+##############################################################################
+variable "enable_deployer" {
+  type        = bool
+  default     = true
+  description = "Deployer should only be used for better deployment performance."
+}
+
+variable "deployer_instance" {
+  type = object({
+    image   = string
+    profile = string
+  })
+  default = {
+    image   = "ibm-redhat-8-10-minimal-amd64-4"
+    profile = "bx2-8x32"
+  }
+  description = "Configuration for the deployer node, including the custom image and instance profile. By default, uses fixpack_15 image and a bx2-8x32 profile."
+}
+
+##############################################################################
+# Compute Variables
+##############################################################################
+variable "client_subnets" {
+  type        = list(string)
+  default     = null
+  description = "Names of existing subnets in which the cluster resources will be deployed. If no value is given, then new subnet(s) will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)"
+}
+
+variable "client_subnets_cidr" {
+  type        = list(string)
+  default     = ["10.241.50.0/24"]
+  description = "Subnet CIDR block to launch the client hosts."
+}
+
+variable "client_instances" {
+  type = list(
+    object({
+      profile = string
+      count   = number
+      image   = string
+    })
+  )
+  default = [{
+    profile = "cx2-2x4"
+    count   = 0
+    image   = "ibm-redhat-8-10-minimal-amd64-4"
+  }]
+  description = "Number of instances to be launched for the client hosts."
+}
+
+variable "cluster_subnet_id" {
+  type        = string
+  default     = null
+  description = "Name of an existing subnet in which the cluster resources will be deployed. If no value is given, then a new subnet will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)"
+}
+
+variable "vpc_cluster_private_subnets_cidr_blocks" {
+  type        = string
+  default     = "10.241.0.0/20"
+  description = "Provide the CIDR block required for the creation of the compute cluster's private subnet. One CIDR block is required. If using a hybrid environment, modify the CIDR block to avoid conflicts with any on-premises CIDR blocks. Ensure the selected CIDR block size can accommodate the maximum number of management and dynamic compute nodes expected in your cluster. For more information on CIDR block size selection, see [Choosing IP ranges for your VPC](https://cloud.ibm.com/docs/vpc?topic=vpc-choosing-ip-ranges-for-your-vpc)."
+}
+
+variable "management_instances" {
+  type = list(
+    object({
+      profile = string
+      count   = number
+      image   = string
+    })
+  )
+  default = [{
+    profile = "cx2-2x4"
+    count   = 0
+    image   = "ibm-redhat-8-10-minimal-amd64-4"
+  }]
+  description = "Number of instances to be launched for the management nodes."
+}
+
+variable "static_compute_instances" {
+  type = list(
+    object({
+      profile = string
+      count   = number
+      image   = string
+    })
+  )
+  default = [{
+    profile = "cx2-2x4"
+    count   = 0
+    image   = "ibm-redhat-8-10-minimal-amd64-4"
+  }]
+  description = "Minimum number of instances to be launched for the compute cluster."
+}
+
+variable "dynamic_compute_instances" {
+  type = list(
+    object({
+      profile = string
+      count   = number
+      image   = string
+    })
+  )
+  default = [{
+    profile = "cx2-2x4"
+    count   = 500
+    image   = "ibm-redhat-8-10-minimal-amd64-4"
+  }]
+  description = "Maximum number of instances to be launched for the compute cluster."
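# Editor's note: an illustrative terraform.tfvars fragment for the instance-group
# variables above. Profiles, counts, and image names are assumptions, not recommendations.
management_instances = [{
  profile = "bx2-16x64"
  count   = 2
  image   = "ibm-redhat-8-10-minimal-amd64-4"
}]
static_compute_instances = [{
  profile = "cx2-8x16"
  count   = 4
  image   = "ibm-redhat-8-10-minimal-amd64-4"
}]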
+}
+
+variable "compute_gui_username" {
+  type        = string
+  default     = "admin"
+  sensitive   = true
+  description = "GUI user to perform system management and monitoring tasks on the compute cluster."
+}
+
+variable "compute_gui_password" {
+  type        = string
+  default     = "hpc@IBMCloud"
+  sensitive   = true
+  description = "Password for the compute cluster GUI."
+}
+
+##############################################################################
+# Storage Variables
+##############################################################################
+variable "storage_subnets" {
+  type        = list(string)
+  default     = null
+  description = "Names of existing subnets in which the cluster resources will be deployed. If no value is given, then new subnet(s) will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)"
+}
+
+variable "storage_subnets_cidr" {
+  type        = list(string)
+  default     = ["10.241.30.0/24"]
+  description = "Subnet CIDR block to launch the storage cluster hosts."
+}
+
+variable "storage_instances" {
+  type = list(
+    object({
+      profile    = string
+      count      = number
+      image      = string
+      filesystem = string
+    })
+  )
+  default = [{
+    profile    = "bx2d-2x8"
+    count      = 0
+    image      = "ibm-redhat-8-10-minimal-amd64-4"
+    filesystem = "/ibm/fs1"
+  }]
+  description = "Number of instances to be launched for the storage cluster."
+}
+
+variable "storage_servers" {
+  type = list(
+    object({
+      profile    = string
+      count      = number
+      image      = string
+      filesystem = string
+    })
+  )
+  default = [{
+    profile    = "cx2d-metal-96x192"
+    count      = 0
+    image      = "ibm-redhat-8-10-minimal-amd64-4"
+    filesystem = "/ibm/fs1"
+  }]
+  description = "Number of bare metal servers to be launched for the storage cluster."
+}
+
+variable "protocol_subnets" {
+  type        = list(string)
+  default     = null
+  description = "Names of existing subnets in which the cluster resources will be deployed. If no value is given, then new subnet(s) will be provisioned for the cluster. [Learn more](https://cloud.ibm.com/docs/vpc)"
+}
+
+variable "protocol_subnets_cidr" {
+  type        = list(string)
+  default     = ["10.241.40.0/24"]
+  description = "Subnet CIDR block to launch the protocol hosts."
+}
+
+variable "protocol_instances" {
+  type = list(
+    object({
+      profile = string
+      count   = number
+      image   = string
+    })
+  )
+  default = [{
+    profile = "bx2-2x8"
+    count   = 0
+    image   = "ibm-redhat-8-10-minimal-amd64-4"
+  }]
+  description = "Number of instances to be launched for the protocol hosts."
+}
+
+variable "colocate_protocol_instances" {
+  type        = bool
+  default     = true
+  description = "Enable this to use storage instances as protocol instances."
+}
+
+variable "storage_gui_username" {
+  type        = string
+  default     = "admin"
+  sensitive   = true
+  description = "GUI user to perform system management and monitoring tasks on the storage cluster."
+}
+
+variable "storage_gui_password" {
+  type        = string
+  default     = "hpc@IBMCloud"
+  sensitive   = true
+  description = "Password for the storage cluster GUI."
+}
+
+variable "nsd_details" {
+  type = list(
+    object({
+      profile  = string
+      capacity = optional(number)
+      iops     = optional(number)
+    })
+  )
+  default     = null
+  description = "Storage Scale NSD details."
+}
+
+variable "storage_security_group_id" {
+  type        = string
+  default     = null
+  description = "Provide the storage security group ID from the Spectrum Scale storage cluster if the mount_path in the cluster_file_share variable is set to use Scale fileset mount points. This security group is essential for establishing connections between the Spectrum LSF cluster nodes and NFS mount points, ensuring the nodes can access the specified mount points."
+}
+
+variable "custom_file_shares" {
+  type = list(object({
+    mount_path = string,
+    size       = optional(number),
+    iops       = optional(number),
+    nfs_share  = optional(string)
+  }))
+  default     = [{ mount_path = "/mnt/vpcstorage/tools", size = 100, iops = 2000 }, { mount_path = "/mnt/vpcstorage/data", size = 100, iops = 6000 }, { mount_path = "/mnt/scale/tools", nfs_share = "" }]
+  description = "Provide details for customizing your shared file storage layout, including mount points, sizes (in GB), and IOPS ranges for up to five file shares if using VPC file storage as the storage option. If using IBM Storage Scale as an NFS mount, update the appropriate mount path and nfs_share values created from the Storage Scale cluster. Note that VPC file storage supports attachment to a maximum of 256 nodes. Exceeding this limit may result in mount point failures due to attachment restrictions. For more information, see [Storage options](https://test.cloud.ibm.com/docs/hpc-ibm-spectrumlsf?topic=hpc-ibm-spectrumlsf-integrating-scale#integrate-scale-and-hpc)."
+  validation {
+    condition     = length([for item in var.custom_file_shares : item if item.nfs_share == null]) <= 5
+    error_message = "The VPC storage custom file share count \"custom_file_shares\" must be less than or equal to 5. Unlimited NFS mounts are allowed."
+  }
+  validation {
+    condition     = length([for mounts in var.custom_file_shares : mounts.mount_path]) == length(toset([for mounts in var.custom_file_shares : mounts.mount_path]))
+    error_message = "Mount path values should not be duplicated."
+  }
+  validation {
+    condition     = alltrue([for mounts in var.custom_file_shares : can(mounts.size) && mounts.size != null ? (10 <= mounts.size && mounts.size <= 32000) : true])
+    error_message = "The custom_file_share size must be greater than or equal to 10 and less than or equal to 32000."
+  }
+}
+
+##############################################################################
+# DNS Variables
+##############################################################################
+variable "dns_instance_id" {
+  type        = string
+  default     = null
+  description = "IBM Cloud HPC DNS service instance ID."
+}
+
+variable "dns_custom_resolver_id" {
+  type        = string
+  default     = null
+  description = "IBM Cloud DNS custom resolver ID."
+}
+
+variable "dns_domain_names" {
+  type = object({
+    compute  = string
+    storage  = optional(string)
+    protocol = optional(string)
+    client   = optional(string)
+    gklm     = optional(string)
+  })
+  default = {
+    compute  = "comp.com"
+    storage  = "strg.com"
+    protocol = "ces.com"
+    client   = "clnt.com"
+    gklm     = "gklm.com"
+  }
+  description = "IBM Cloud HPC DNS domain names."
+}
+
+##############################################################################
+# Encryption Variables
+##############################################################################
+variable "key_management" {
+  type        = string
+  default     = null
+  description = "Set the value as key_protect to enable customer-managed encryption for boot volume and file share. If key_management is set as null, IBM Cloud resources will always be encrypted with provider-managed encryption."
+  validation {
+    condition     = var.key_management == "null" || var.key_management == null || var.key_management == "key_protect"
+    error_message = "key_management must be either 'null' or 'key_protect'."
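# Editor's note: an illustrative custom_file_shares value that mixes VPC file shares
# with an IBM Storage Scale NFS export. Paths, sizes, and the NFS endpoint are
# assumptions for illustration and satisfy the validations above.
custom_file_shares = [
  { mount_path = "/mnt/vpcstorage/tools", size = 100, iops = 2000 },
  { mount_path = "/mnt/vpcstorage/data", size = 500, iops = 6000 },
  { mount_path = "/mnt/scale/tools", nfs_share = "10.241.30.5:/ibm/fs1/tools" }
]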
+ } +} + +variable "kms_instance_name" { + type = string + default = null + description = "Provide the name of the existing Key Protect instance associated with the Key Management Service. Note: To use existing kms_instance_name set key_management as key_protect. The name can be found under the details of the KMS, see [View key-protect ID](https://cloud.ibm.com/docs/key-protect?topic=key-protect-retrieve-instance-ID&interface=ui)." +} + +variable "kms_key_name" { + type = string + default = null + description = "Provide the existing kms key name that you want to use for the IBM Cloud HPC cluster. Note: kms_key_name to be considered only if key_management value is set as key_protect.(for example kms_key_name: my-encryption-key)." +} + +variable "skip_iam_share_authorization_policy" { + type = bool + default = false + description = "When using an existing KMS instance name, set this value to true if authorization is already enabled between KMS instance and the VPC file share. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment.For more information on how to create authorization policy manually, see [creating authorization policies for VPC file share](https://cloud.ibm.com/docs/vpc?topic=vpc-file-s2s-auth&interface=ui)." +} + +variable "boot_volume_encryption_key" { + type = string + default = null + description = "The kms_key crn." +} + +variable "existing_kms_instance_guid" { + type = string + default = null + description = "The existing KMS instance guid." +} + +# variable "hpcs_instance_name" { +# type = string +# default = null +# description = "Hyper Protect Crypto Service instance" +# } + +variable "vpn_enabled" { + type = bool + default = false + description = "Set the value as true to deploy a VPN gateway for VPC in the cluster." +} + +variable "skip_flowlogs_s2s_auth_policy" { + type = bool + default = false + description = "Skip auth policy between flow logs service and COS instance, set to true if this policy is already in place on account." +} + +variable "skip_kms_s2s_auth_policy" { + type = bool + default = false + description = "Skip auth policy between KMS service and COS instance, set to true if this policy is already in place on account." +} + +variable "skip_iam_block_storage_authorization_policy" { + type = bool + default = false + description = "When using an existing KMS instance name, set this value to true if authorization is already enabled between KMS instance and the block storage volume. Otherwise, default is set to false. Ensuring proper authorization avoids access issues during deployment.For more information on how to create authorization policy manually, see [creating authorization policies for block storage volume](https://cloud.ibm.com/docs/vpc?topic=vpc-block-s2s-auth&interface=ui)." 
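# Editor's note: an illustrative terraform.tfvars fragment for enabling customer-managed
# encryption with the variables above. The instance and key names are placeholders.
key_management    = "key_protect"
kms_instance_name = "my-key-protect-instance"
kms_key_name      = "my-encryption-key"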
+}
+
+##############################################################################
+# Observability Variables
+##############################################################################
+variable "enable_cos_integration" {
+  type        = bool
+  default     = false
+  description = "Integrate COS with the HPC solution."
+}
+
+variable "cos_instance_name" {
+  type        = string
+  default     = null
+  description = "Existing COS instance name."
+}
+
+variable "enable_vpc_flow_logs" {
+  type        = bool
+  default     = false
+  description = "Enable VPC flow logs."
+}
+
+##############################################################################
+# Scale specific Variables
+##############################################################################
+variable "filesystem_config" {
+  type = list(
+    object({
+      filesystem               = string
+      block_size               = string
+      default_data_replica     = number
+      default_metadata_replica = number
+      max_data_replica         = number
+      max_metadata_replica     = number
+      mount_point              = string
+    })
+  )
+  default     = null
+  description = "File system configurations."
+}
+
+# variable "filesets_config" {
+#   type = list(
+#     object({
+#       fileset           = string
+#       filesystem        = string
+#       junction_path     = string
+#       client_mount_path = string
+#       quota             = number
+#     })
+#   )
+#   default     = null
+#   description = "Fileset configurations."
+# }
+
+variable "afm_instances" {
+  type = list(
+    object({
+      profile = string
+      count   = number
+      image   = string
+    })
+  )
+  default = [{
+    profile = "bx2-2x8"
+    count   = 0
+    image   = "ibm-redhat-8-10-minimal-amd64-4"
+  }]
+  description = "Number of instances to be launched for the AFM hosts."
+}
+
+variable "afm_cos_config" {
+  type = list(
+    object({
+      afm_fileset          = string,
+      mode                 = string,
+      cos_instance         = string,
+      bucket_name          = string,
+      bucket_region        = string,
+      cos_service_cred_key = string,
+      bucket_type          = string,
+      bucket_storage_class = string
+    })
+  )
+  default = [{
+    afm_fileset          = "afm_fileset"
+    mode                 = "iw"
+    cos_instance         = ""
+    bucket_name          = ""
+    bucket_region        = "us-south"
+    cos_service_cred_key = ""
+    bucket_storage_class = "smart"
+    bucket_type          = "region_location"
+  }]
+  # default = [{
+  #   afm_fileset          = "afm_fileset"
+  #   mode                 = "iw"
+  #   cos_instance         = null
+  #   bucket_name          = null
+  #   bucket_region        = "us-south"
+  #   cos_service_cred_key = ""
+  #   bucket_storage_class = "smart"
+  #   bucket_type          = "region_location"
+  # }]
+  description = "AFM configurations."
+}
+
+##############################################################################
+# LSF specific Variables
+##############################################################################
+# variable "cluster_id" {
+#   type        = string
+#   default     = "HPCCluster"
+#   description = "Unique ID of the cluster used by LSF for configuration of resources. This can be up to 39 alphanumeric characters."
+#   validation {
+#     condition     = 0 < length(var.cluster_id) && length(var.cluster_id) < 40 && can(regex("^[a-zA-Z0-9_.-]+$", var.cluster_id))
+#     error_message = "The ID can be up to 39 alphanumeric characters including the underscore (_), the hyphen (-), and the period (.) characters."
+#   }
+# }
+
+variable "enable_hyperthreading" {
+  type        = bool
+  default     = true
+  description = "Setting this to true will enable hyper-threading in the worker nodes of the cluster (default). Otherwise, hyper-threading will be disabled."
+}
+
+# variable "enable_dedicated_host" {
+#   type        = bool
+#   default     = false
+#   description = "Set to true to use dedicated hosts for compute hosts (default: false)."
+# } + +# variable "dedicated_host_placement" { +# type = string +# default = "spread" +# description = "Specify 'pack' or 'spread'. The 'pack' option will deploy VSIs on one dedicated host until full before moving on to the next dedicated host." +# validation { +# condition = var.dedicated_host_placement == "spread" || var.dedicated_host_placement == "pack" +# error_message = "Supported values for dedicated_host_placement: spread or pack." +# } +# } + +variable "app_center_gui_password" { + type = string + default = "" + sensitive = true + description = "Password for IBM Spectrum LSF Application Center GUI." +} + +############################################################################## +# Symphony specific Variables +############################################################################## + +############################################################################## +# Slurm specific Variables +############################################################################## + +############################################################################## +# Observability Variables +############################################################################## + +variable "observability_atracker_enable" { + type = bool + default = true + description = "Activity Tracker Event Routing to configure how to route auditing events. While multiple Activity Tracker instances can be created, only one tracker is needed to capture all events. Creating additional trackers is unnecessary if an existing Activity Tracker is already integrated with a COS bucket. In such cases, set the value to false, as all events can be monitored and accessed through the existing Activity Tracker." +} + +variable "observability_atracker_target_type" { + type = string + default = "cloudlogs" + description = "All the events will be stored in either COS bucket or Cloud Logs on the basis of user input, so customers can retrieve or ingest them in their system." + validation { + condition = contains(["cloudlogs", "cos"], var.observability_atracker_target_type) + error_message = "Allowed values for atracker target type is cloudlogs and cos." + } +} + +variable "observability_monitoring_enable" { + description = "Set false to disable IBM Cloud Monitoring integration. If enabled, infrastructure and LSF application metrics from Management Nodes will be ingested." + type = bool + default = true +} + +variable "observability_logs_enable_for_management" { + description = "Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Management Nodes will be ingested." + type = bool + default = false +} + +variable "observability_logs_enable_for_compute" { + description = "Set false to disable IBM Cloud Logs integration. If enabled, infrastructure and LSF application logs from Compute Nodes will be ingested." + type = bool + default = false +} + +variable "observability_enable_platform_logs" { + description = "Setting this to true will create a tenant in the same region that the Cloud Logs instance is provisioned to enable platform logs for that region. NOTE: You can only have 1 tenant per region in an account." + type = bool + default = false +} + +variable "observability_enable_metrics_routing" { + description = "Enable metrics routing to manage metrics at the account-level by configuring targets and routes that define where data points are routed." 
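# Editor's note: an illustrative sketch of the allow-list pattern used by the
# observability validations nearby (contains() over a fixed set of permitted values).
locals {
  atracker_target_ok = contains(["cloudlogs", "cos"], "cloudlogs") # true
  retention_ok       = contains([7, 14, 30, 60, 90], 7)            # true
}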
+ type = bool + default = false +} + +variable "observability_logs_retention_period" { + description = "The number of days IBM Cloud Logs will retain the logs data in Priority insights. Allowed values: 7, 14, 30, 60, 90." + type = number + default = 7 + validation { + condition = contains([7, 14, 30, 60, 90], var.observability_logs_retention_period) + error_message = "Allowed values for cloud logs retention period is 7, 14, 30, 60, 90." + } +} + +variable "observability_monitoring_on_compute_nodes_enable" { + description = "Set false to disable IBM Cloud Monitoring integration. If enabled, infrastructure metrics from Compute Nodes will be ingested." + type = bool + default = false +} + +variable "observability_monitoring_plan" { + description = "Type of service plan for IBM Cloud Monitoring instance. You can choose one of the following: lite, graduated-tier. For all details visit [IBM Cloud Monitoring Service Plans](https://cloud.ibm.com/docs/monitoring?topic=monitoring-service_plans)." + type = string + default = "graduated-tier" + validation { + condition = can(regex("lite|graduated-tier", var.observability_monitoring_plan)) + error_message = "Please enter a valid plan for IBM Cloud Monitoring, for all details visit https://cloud.ibm.com/docs/monitoring?topic=monitoring-service_plans." + } +} + +variable "enable_landing_zone" { + type = bool + default = true + description = "Run landing zone module." +} + +variable "enable_atracker" { + type = bool + default = false + description = "Enable Activity tracker" +} + +variable "bastion_security_group_id" { + type = string + default = null + description = "bastion security group id" +} + +variable "deployer_hostname" { + type = string + default = null + description = "deployer node hostname" +} + +variable "deployer_ip" { + type = string + default = null + description = "deployer node ip" +} + +variable "cloud_logs_data_bucket" { + type = any + default = null + description = "cloud logs data bucket" +} + +variable "cloud_metrics_data_bucket" { + type = any + default = null + description = "cloud metrics data bucket" +} + +# variable "scc_cos_bucket" { +# type = string +# default = null +# description = "scc cos bucket" +# } + +# variable "scc_cos_instance_crn" { +# type = string +# default = null +# description = "scc cos instance crn" +# } + +############################################################################# +# VARIABLES TO BE CHECKED +############################################################################## + + + + + + + + +############################################################################# +# LDAP variables +############################################################################## +variable "enable_ldap" { + type = bool + default = false + description = "Set this option to true to enable LDAP for IBM Cloud HPC, with the default value set to false." +} + +variable "ldap_basedns" { + type = string + default = "ldapscale.com" + description = "The dns domain name is used for configuring the LDAP server. If an LDAP server is already in existence, ensure to provide the associated DNS domain name." +} + +variable "ldap_server" { + type = string + default = "" + description = "Provide the IP address for the existing LDAP server. If no address is given, a new LDAP server will be created." +} + +variable "ldap_server_cert" { + type = string + sensitive = true + default = "" + description = "Provide the existing LDAP server certificate. This value is required if the 'ldap_server' variable is not set to null. 
If the certificate is not provided or is invalid, the LDAP configuration may fail."
+}
+
+variable "ldap_admin_password" {
+  type        = string
+  sensitive   = true
+  default     = null
+  description = "The LDAP administrative password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@_+:). It is important to avoid including the username in the password for enhanced security."
+}
+
+variable "ldap_user_name" {
+  type        = string
+  default     = ""
+  description = "Custom LDAP user for performing cluster operations. Note: The username should be 4 to 32 characters long (any combination of lowercase and uppercase letters). [This value is ignored for an existing LDAP server.]"
+}
+
+variable "ldap_user_password" {
+  type        = string
+  sensitive   = true
+  default     = ""
+  description = "The LDAP user password should be 8 to 20 characters long, with a mix of at least three alphabetic characters, including one uppercase and one lowercase letter. It must also include two numerical digits and at least one special character from (~@_+:). It is important to avoid including the username in the password for enhanced security. [This value is ignored for an existing LDAP server.]"
+}
+
+variable "ldap_instance_key_pair" {
+  type        = list(string)
+  default     = null
+  description = "Name of the SSH key configured in your IBM Cloud account that is used to establish a connection to the LDAP Server. Make sure that the SSH key is present in the same resource group and region where the LDAP Servers are provisioned. If you do not have an SSH key in your IBM Cloud account, create one by using the [SSH keys](https://cloud.ibm.com/docs/vpc?topic=vpc-ssh-keys) instructions."
+}
+
+variable "ldap_instance" {
+  type = list(
+    object({
+      profile = string
+      image   = string
+    })
+  )
+  default = [{
+    profile = "cx2-2x4"
+    image   = "ibm-ubuntu-22-04-5-minimal-amd64-1"
+  }]
+  description = "Profile and image name to be used for provisioning the LDAP instances. Note: Only Debian-based OS images are supported for the LDAP feature."
+}
+
+##############################################################################
+# GKLM variables
+##############################################################################
+variable "scale_encryption_enabled" {
+  type        = bool
+  default     = false
+  description = "To enable encryption for the filesystem, select true or false."
+}
+
+variable "scale_encryption_type" {
+  type        = string
+  default     = null
+  description = "To enable filesystem encryption, specify either 'key_protect' or 'gklm'. If neither is specified, the default value will be 'null' and encryption is disabled."
+}
+
+variable "gklm_instance_key_pair" {
+  type        = list(string)
+  default     = null
+  description = "The key pair to use to launch the GKLM host."
+}
+
+variable "gklm_instances" {
+  type = list(
+    object({
+      profile = string
+      count   = number
+      image   = string
+    })
+  )
+  default = [{
+    profile = "bx2-2x8"
+    count   = 2
+    image   = "ibm-redhat-8-10-minimal-amd64-4"
+  }]
+  description = "Number of instances to be launched for GKLM."
+}
+
+# variable "scale_encryption_admin_default_password" {
+#   type        = string
+#   default     = null
+#   description = "The default administrator password used for resetting the admin password based on the user input. The password that was configured during the GKLM installation has to be updated."
+# } + +# variable "scale_encryption_admin_username" { +# type = string +# default = null +# description = "The default Admin username for Security Key Lifecycle Manager(GKLM)." +# } + +variable "scale_encryption_admin_password" { + type = string + default = null + description = "Password that is used for performing administrative operations for the GKLM.The password must contain at least 8 characters and at most 20 characters. For a strong password, at least three alphabetic characters are required, with at least one uppercase and one lowercase letter. Two numbers, and at least one special character from this(~@_+:). Make sure that the password doesn't include the username. Visit this [page](https://www.ibm.com/docs/en/gklm/3.0.1?topic=roles-password-policy) to know more about password policy of GKLM. " +} + +variable "scale_ansible_repo_clone_path" { + type = string + default = "/opt/ibm/ibm-spectrumscale-cloud-deploy" + description = "Path to clone github.com/IBM/ibm-spectrum-scale-install-infra." +} + +variable "spectrumscale_rpms_path" { + type = string + default = "/opt/ibm/gpfs_cloud_rpms" + description = "Path that contains IBM Spectrum Scale product cloud rpms." +} + +variable "storage_type" { + type = string + default = "scratch" + description = "Select the required storage type(scratch/persistent/eval)." +} + +variable "using_packer_image" { + type = bool + default = false + description = "If true, gpfs rpm copy step will be skipped during the configuration." +} + +variable "using_jumphost_connection" { + type = bool + default = false + description = "If true, will skip the jump/bastion host configuration." +} + +variable "bastion_user" { + type = string + default = "ubuntu" + description = "Provide the username for Bastion login." +} + +variable "inventory_format" { + type = string + default = "ini" + description = "Specify inventory format suited for ansible playbooks." +} + +variable "bastion_instance_id" { + type = string + default = null + description = "Bastion instance id." +} + +variable "bastion_ssh_private_key" { + type = string + default = "None" + description = "Bastion SSH private key path, which will be used to login to bastion host." +} + +variable "create_separate_namespaces" { + type = bool + default = true + description = "Flag to select if separate namespace needs to be created for compute instances." +} + +variable "create_scale_cluster" { + type = bool + default = true + description = "Flag to represent whether to create scale cluster or not." +} + +variable "using_rest_api_remote_mount" { + type = string + default = true + description = "If false, skips GUI initialization on compute cluster for remote mount configuration." +} + +variable "bastion_fip" { + type = string + default = null + description = "bastion fip" +} + +variable "scale_compute_cluster_filesystem_mountpoint" { + type = string + default = "/gpfs/fs1" + description = "Compute cluster (accessingCluster) Filesystem mount point." 
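# Editor's note: an illustrative terraform.tfvars fragment for enabling Storage Scale
# filesystem encryption with the GKLM variables above; the key-pair name is a placeholder.
scale_encryption_enabled = true
scale_encryption_type    = "gklm"
gklm_instance_key_pair   = ["my-ssh-key"]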
+} +############################################################################## +# Dedicatedhost Variables +############################################################################## + +variable "enable_dedicated_host" { + type = bool + default = false + description = "Enables dedicated host to the compute instances" +} + +########################################################################### +# Existing Bastion Support variables +########################################################################### + +variable "existing_bastion_instance_name" { + type = string + default = null + description = "Provide the name of the bastion instance. If none given then new bastion will be created." +} + +variable "existing_bastion_instance_public_ip" { + type = string + default = null + description = "Provide the public ip address of the bastion instance to establish the remote connection." +} + +variable "existing_bastion_security_group_id" { + type = string + default = null + description = "Specify the security group ID for the bastion server. This ID will be added as an allowlist rule on the HPC cluster nodes to facilitate secure SSH connections through the bastion node. By restricting access through a bastion server, this setup enhances security by controlling and monitoring entry points into the cluster environment. Ensure that the specified security group is correctly configured to permit only authorized traffic for secure and efficient management of cluster resources." +} + +variable "existing_bastion_ssh_private_key" { + type = string + sensitive = true + default = null + description = "Provide the private SSH key (named id_rsa) used during the creation and configuration of the bastion server to securely authenticate and connect to the bastion server. This allows access to internal network resources from a secure entry point. Note: The corresponding public SSH key (named id_rsa.pub) must already be available in the ~/.ssh/authorized_keys file on the bastion host to establish authentication." +} + +variable "resource_group_ids" { + type = any + default = null + description = "Map describing resource groups to create or reference" +} +############################################################################## +# Login Variables +############################################################################## +variable "login_instance" { + type = list( + object({ + profile = string + image = string + }) + ) + default = [{ + profile = "bx2-2x8" + image = "hpcaas-lsf10-rhel810-compute-v8" + }] + description = "Number of instances to be launched for login node." +} + +############################################################################## +# Environment Variables +############################################################################## + +# tflint-ignore: all +variable "TF_VERSION" { + type = string + default = "1.9" + description = "The version of the Terraform engine that's used in the Schematics workspace." +} + +# tflint-ignore: all +variable "TF_PARALLELISM" { + type = string + default = "250" + description = "Parallelism/ concurrent operations limit. Valid values are between 1 and 256, both inclusive. [Learn more](https://www.terraform.io/docs/internals/graph.html#walking-the-graph)." + validation { + condition = 1 <= var.TF_PARALLELISM && var.TF_PARALLELISM <= 256 + error_message = "Input \"TF_PARALLELISM\" must be greater than or equal to 1 and less than or equal to 256." 
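# Editor's note: an illustrative terraform.tfvars fragment for reusing an existing
# bastion host via the existing_bastion_* variables above. The name, IP address,
# security group ID, and key path are placeholders; file() is one assumed way to pass
# the private key contents.
existing_bastion_instance_name      = "my-existing-bastion"
existing_bastion_instance_public_ip = "150.240.10.25"
existing_bastion_security_group_id  = "r014-12345678-abcd-ef00-0000-000000000000"
existing_bastion_ssh_private_key    = file("~/.ssh/id_rsa")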
+  }
+}
+
+##############################################################################
+# SCC Variables
+##############################################################################
+
+variable "sccwp_service_plan" {
+  description = "IBM service pricing plan for SCC Workload Protection."
+  type        = string
+  default     = "free-trial"
+  validation {
+    error_message = "Plan for SCC Workload Protection instances can only be `free-trial` or `graduated-tier`."
+    condition = contains(
+      ["free-trial", "graduated-tier"],
+      var.sccwp_service_plan
+    )
+  }
+}
+
+variable "sccwp_enable" {
+  type        = bool
+  default     = true
+  description = "Flag to enable SCC instance creation. If true, an instance of SCC (Security and Compliance Center) will be created."
+}
+
+variable "cspm_enabled" {
+  description = "Enable Cloud Security Posture Management (CSPM) for the Workload Protection instance. This will create a trusted profile associated with the SCC Workload Protection instance that has viewer / reader access to the App Config service and viewer access to the Enterprise service. [Learn more](https://cloud.ibm.com/docs/workload-protection?topic=workload-protection-about)."
+  type        = bool
+  default     = false
+  nullable    = false
+}
+
+variable "app_config_plan" {
+  description = "Specify the IBM service pricing plan for the app configuration. Allowed values are 'basic', 'lite', 'standardv2', 'enterprise'."
+  type        = string
+  default     = "basic"
+  validation {
+    error_message = "Plan for App configuration can only be basic, lite, standardv2, enterprise."
+    condition = contains(
+      ["basic", "lite", "standardv2", "enterprise"],
+      var.app_config_plan
+    )
+  }
+}
diff --git a/version.tf b/version.tf
new file mode 100644
index 00000000..7615eacf
--- /dev/null
+++ b/version.tf
@@ -0,0 +1,19 @@
+##############################################################################
+# Terraform Providers
+##############################################################################
+
+terraform {
+  required_version = ">= 1.9.0"
+  required_providers {
+    ibm = {
+      source  = "IBM-Cloud/ibm"
+      version = ">= 1.68.1, < 2.0.0"
+    }
+    time = {
+      source  = "hashicorp/time"
+      version = ">= 0.9.1, < 1.0.0"
+    }
+  }
+}
+
+##############################################################################
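For reference, the root version.tf above accepts any IBM provider release in the range ">= 1.68.1, < 2.0.0"; a consuming configuration can still pin an exact version inside that range. A minimal, illustrative sketch (the pinned version shown is one already referenced elsewhere in this change, used here only as an example):

terraform {
  required_version = ">= 1.9.0"
  required_providers {
    ibm = {
      source  = "IBM-Cloud/ibm"
      version = "1.77.0" # any release satisfying ">= 1.68.1, < 2.0.0" works
    }
  }
}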