diff --git a/Makefile b/Makefile index 3e90d55cfb..4e6218a628 100644 --- a/Makefile +++ b/Makefile @@ -68,11 +68,10 @@ install-dev-deps: warn-terraform-version warn-packer-version check-pre-commit ch go install github.com/fzipp/gocyclo/cmd/gocyclo@latest go install github.com/go-critic/go-critic/cmd/gocritic@latest go install github.com/google/addlicense@latest - go install mvdan.cc/sh/v3/cmd/shfmt@latest - go install golang.org/x/tools/cmd/goimports@latest + go install mvdan.cc/sh/v3/cmd/shfmt@v3.12.0 + go install golang.org/x/tools/cmd/goimports@v0.42.0 go install honnef.co/go/tools/cmd/staticcheck@latest go install github.com/jstemmer/go-junit-report/v2@latest - pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements-dev.txt pip install mypy==1.18.2 diff --git a/README.md b/README.md index 65c6a3b432..44b9683ad2 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ is the recommended path to get started with the Cluster Toolkit. ### Using the Pre-built Bundle (Recommended) -For the easiest setup, download the `gcluster_bundle_linux.zip` (for Linux) or `gcluster_bundle_mac.zip` (for macOS) from the [Releases](https://github.com/GoogleCloudPlatform/cluster-toolkit/releases) page. These bundles include the pre-compiled `gcluster` binary, the `examples` folder, and the `community/examples` folder. +For the easiest setup, download the appropriate bundle for your operating system and architecture (e.g., gcluster_bundle_linux_amd64.zip, gcluster_bundle_linux_arm64.zip, gcluster_bundle_mac_amd64.zip, or gcluster_bundle_mac_arm64.zip) from the [Releases](https://github.com/GoogleCloudPlatform/cluster-toolkit/releases) page. These bundles include the pre-compiled `gcluster` binary, the `examples` folder, and the `community/examples` folder. 
#### Bundle Compatibility Matrix @@ -51,25 +51,45 @@ The pre-built bundles are compiled for Linux and macOS execution environments an | Platform | Support Status | Notes | | :--- | :---: | :--- | -| **Linux** | ✅ | Pre-compiled on Debian Bullseye. | -| **Google Cloud Shell** | ✅ | Native support via the Linux binary. | -| **macOS** | ✅ | Native support via the Mac binary. | +| **Linux (amd64 / arm64)** | ✅ | Pre-compiled on Debian Bullseye. Includes amd64 (x86_64) and arm64 builds starting v1.85.0. | +| **Google Cloud Shell** | ✅ | Native support via the Linux amd64 binary. | +| **macOS (amd64 / arm64)** | ✅ | Native support via the Mac binary. Includes amd64 (Intel) and arm64 (Apple Silicon) builds starting v1.85.0. | | **Windows** | ❎ | Please [Build from Source](#building-from-source). | 1. Download and extract the bundle: + > **_NOTE:_** The binary is available starting with version 1.82.0 [Only supports x86/amd64 arch]. Multi-architecture builds (amd64 and arm64) are available starting with version 1.85.0. 
+ + For versions v1.85.0 and newer (Multi-architecture): + ```shell # Find all available releases at: https://github.com/GoogleCloudPlatform/cluster-toolkit/releases - # Set the desired version TAG (e.g., v1.82.0) + # Set the desired version TAG (e.g., v1.85.0) TAG=vX.Y.Z - # Replace gcluster-bundle.zip with the platform-specific filename (e.g., gcluster_bundle_linux.zip) - curl -LO https://github.com/GoogleCloudPlatform/cluster-toolkit/releases/download/${TAG}/gcluster-bundle.zip - unzip gcluster-bundle.zip -d gcluster-bundle/ + # Set your OS (linux or mac) and Architecture (amd64 or arm64) + OS="linux" + ARCH="amd64" + # Download and extract the platform-specific bundle + curl -LO https://github.com/GoogleCloudPlatform/cluster-toolkit/releases/download/${TAG}/gcluster_bundle_${OS}_${ARCH}.zip + unzip gcluster_bundle_${OS}_${ARCH}.zip -d gcluster-bundle/ cd gcluster-bundle chmod +x gcluster ``` - > **_NOTE:_** The binary is available starting with version 1.82.0 + For versions v1.82.0 through v1.84.0: + + ```shell + # Find all available releases at: https://github.com/GoogleCloudPlatform/cluster-toolkit/releases + # Set the desired version TAG (e.g., v1.84.0) + TAG=vX.Y.Z + # Set your OS (linux or mac) + OS="linux" + # Download and extract + curl -LO https://github.com/GoogleCloudPlatform/cluster-toolkit/releases/download/${TAG}/gcluster_bundle_${OS}.zip + unzip gcluster_bundle_${OS}.zip -d gcluster-bundle/ + cd gcluster-bundle + chmod +x gcluster + ``` 2. Verify the Installation: @@ -331,6 +351,13 @@ hpc-slurm/ See [Cloud Docs on Installing Dependencies](https://cloud.google.com/cluster-toolkit/docs/setup/install-dependencies). +When running commands like `deploy`, `destroy`, or `export-outputs`, the toolkit can automatically download missing dependencies. You can control this behavior using the `--download-dependencies` flag: + +* `--download-dependencies`: Automatically download missing dependencies without prompting. 
+* `--download-dependencies=false`: Fail immediately if any required dependencies are missing. + +If the flag is not provided, you will be interactively asked to confirm the download. + ### Notes on Packer The Toolkit supports Packer templates in the contemporary [HCL2 file diff --git a/cmd/dependencies.go b/cmd/dependencies.go new file mode 100644 index 0000000000..0d15394304 --- /dev/null +++ b/cmd/dependencies.go @@ -0,0 +1,54 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "hpc-toolkit/pkg/dependencies" + "hpc-toolkit/pkg/logging" + + "github.com/spf13/cobra" + "github.com/spf13/pflag" +) + +var downloadDependencies bool +var ensureDependenciesFn = dependencies.EnsureDependencies + +func addDependenciesFlags(flagset *pflag.FlagSet) { + flagset.BoolVar(&downloadDependencies, "download-dependencies", false, "Automatically download missing dependencies. 
Pass --download-dependencies=false to fail if missing.") +} + +func initDependencies(cmd *cobra.Command) { + allowedCmds := map[string]bool{ + "deploy": true, + "destroy": true, + "export-outputs": true, + } + if !allowedCmds[cmd.Name()] { + return + } + + decision := dependencies.DownloadDecisionAsk + if cmd.Flags().Changed("download-dependencies") { + if downloadDependencies { + decision = dependencies.DownloadDecisionYes + } else { + decision = dependencies.DownloadDecisionNo + } + } + + if err := ensureDependenciesFn(decision); err != nil { + logging.Fatal("Failed to setup dependencies: %v", err) + } +} diff --git a/cmd/dependencies_test.go b/cmd/dependencies_test.go new file mode 100644 index 0000000000..c59d55f99b --- /dev/null +++ b/cmd/dependencies_test.go @@ -0,0 +1,84 @@ +/* +Copyright 2026 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cmd + +import ( + "testing" + + "hpc-toolkit/pkg/dependencies" + + "github.com/spf13/cobra" + "github.com/spf13/pflag" +) + +func TestAddDependenciesFlags(t *testing.T) { + flags := pflag.NewFlagSet("test", pflag.ContinueOnError) + addDependenciesFlags(flags) + + flag := flags.Lookup("download-dependencies") + if flag == nil { + t.Fatalf("Expected 'download-dependencies' flag to be added, but it was not") + } + + if flag.DefValue != "false" { + t.Errorf("Expected default value to be 'false', got '%s'", flag.DefValue) + } +} + +func TestInitDependenciesIgnoresCommands(t *testing.T) { + cmd := &cobra.Command{Use: "unrelated-command"} + flags := pflag.NewFlagSet("test", pflag.ContinueOnError) + addDependenciesFlags(flags) + cmd.Flags().AddFlagSet(flags) + + called := false + originalFn := ensureDependenciesFn + ensureDependenciesFn = func(d dependencies.DownloadDecision) error { + called = true + return nil + } + defer func() { ensureDependenciesFn = originalFn }() + + initDependencies(cmd) + + if called { + t.Errorf("Expected ensureDependenciesFn not to be called") + } +} + +func TestInitDependenciesAllowedCommands(t *testing.T) { + for _, cmdName := range []string{"deploy", "destroy", "export-outputs"} { + cmd := &cobra.Command{Use: cmdName} + flags := pflag.NewFlagSet("test", pflag.ContinueOnError) + addDependenciesFlags(flags) + cmd.Flags().AddFlagSet(flags) + + called := false + originalFn := ensureDependenciesFn + ensureDependenciesFn = func(d dependencies.DownloadDecision) error { + called = true + return nil + } + defer func() { ensureDependenciesFn = originalFn }() + + initDependencies(cmd) + + if !called { + t.Errorf("Expected ensureDependenciesFn to be called for command %s", cmdName) + } + } +} diff --git a/cmd/deploy.go b/cmd/deploy.go index b45c7aae5b..368be642da 100644 --- a/cmd/deploy.go +++ b/cmd/deploy.go @@ -60,7 +60,7 @@ func runDeployCmd(cmd *cobra.Command, args []string) { deplRoot = args[0] // check that no "create" flags were 
specified cmd.Flags().VisitAll(func(f *pflag.Flag) { - if f.Changed && createCmd.Flag(f.Name) != nil { + if f.Changed && createCmd.LocalFlags().Lookup(f.Name) != nil { checkErr(fmt.Errorf("cannot specify flag %q with DEPLOYMENT_DIRECTORY provided", f.Name), nil) } }) diff --git a/cmd/root.go b/cmd/root.go index 62230fb5cf..2fc312c7b7 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -52,15 +52,17 @@ var ( logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.84.0", + Version: "v1.85.0", Annotations: annotation, } ) func init() { + addDependenciesFlags(rootCmd.PersistentFlags()) addColorFlag(rootCmd.PersistentFlags()) rootCmd.PersistentPreRun = func(cmd *cobra.Command, args []string) { initColor() + initDependencies(cmd) } } diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index e07633b0e1..02f98fba70 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -1,18 +1,18 @@ altgraph==0.17.5 archspec==0.2.5 argcomplete==3.6.3 -asgiref==3.11.0 -astroid==4.0.3 +asgiref==3.11.1 +astroid>=4.0.2,<4.1.0 attrs==25.4.0 # This should be supported by zoneinfo in Python 3.9+ backports.zoneinfo==0.2.1;python_version<"3.9" -cachetools>=2.0,<7.0 -certifi==2026.1.4 +cachetools>=2.0,<8.0 +certifi==2026.2.25 cffi>=2.0.0 cfgv==3.5.0 charset-normalizer==3.4.4 click==8.3.1 -crispy-bootstrap5==2025.6 +crispy-bootstrap5==2026.3 cryptography==46.0.5 decorator==5.2.1 defusedxml==0.7.1 @@ -20,32 +20,32 @@ dill==0.4.1 distlib==0.4.0 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==6.0.2 -django-allauth==65.14.1 -django-crispy-forms==2.5 +Django==6.0.3 +django-allauth==65.14.3 +django-crispy-forms==2.6 django-extensions==3.2.3 djangorestframework==3.16.1 -filelock==3.20.3 -google-api-core==2.29.0 -google-api-python-client==2.188.0 +filelock==3.24.3 +google-api-core==2.30.0 
+google-api-python-client==2.190.0 google-auth==2.48.0 google-auth-httplib2==0.3.0 -google-cloud-artifact-registry==1.19.0 +google-cloud-artifact-registry==1.20.0 google-cloud-secret-manager==2.26.0 google-cloud-build==3.35.0 google-cloud-billing==1.18.0 google-cloud-core==2.5.0 -google-cloud-pubsub==2.34.0 -google-cloud-compute==1.43.0 -google-cloud-storage==3.8.0 +google-cloud-pubsub==2.35.0 +google-cloud-compute==1.44.0 +google-cloud-storage==3.9.0 google-cloud-secret-manager==2.26.0 google-crc32c==1.8.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 grafana-api==1.0.3 grpc-google-iam-v1==0.14.3 -grpcio==1.76.0 -grpcio-status==1.76.0 +grpcio==1.78.0 +grpcio-status==1.78.0 h11==0.16.0 httplib2==0.31.2 identify==2.6.16 @@ -65,15 +65,15 @@ nodeenv==1.10.0 oauthlib==3.3.1 path==17.1.1 pkgutil_resolve_name==1.3.10 -platformdirs==4.5.1 +platformdirs==4.9.2 pre-commit==4.5.1 -proto-plus==1.27.0 +proto-plus==1.27.1 protobuf>=6.31.1,<7.0.0 -pyasn1==0.6.2 +pyasn1==0.6.3 pyasn1-modules==0.4.2 pycparser==3.0 -PyJWT==2.11.0 -pylint==4.0.4 +PyJWT==2.12.0 +pylint==4.0.5 pylint-django==2.7.0 pylint-plugin-utils==0.9.0 pyparsing==3.3.2 @@ -99,9 +99,9 @@ typing-inspect==0.9.0 typing_extensions==4.15.0 uritemplate==4.2.0 urllib3==2.6.3 -uvicorn==0.40.0 -virtualenv==20.36.1 -wrapt==2.1.0 -xmltodict==1.0.2 +uvicorn==0.41.0 +virtualenv==21.1.0 +wrapt==2.1.1 +xmltodict==1.0.4 yq==3.4.3 zipp==3.23.0 diff --git a/community/modules/compute/gke-nodeset/versions.tf b/community/modules/compute/gke-nodeset/versions.tf index f4543b316b..60bc871ff0 100644 --- a/community/modules/compute/gke-nodeset/versions.tf +++ b/community/modules/compute/gke-nodeset/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-nodeset/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-nodeset/v1.85.0" } } diff --git a/community/modules/compute/gke-partition/versions.tf 
b/community/modules/compute/gke-partition/versions.tf index 5ca2e04f6f..38e15b3258 100644 --- a/community/modules/compute/gke-partition/versions.tf +++ b/community/modules/compute/gke-partition/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-partition/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-partition/v1.85.0" } } diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index bcb5c6766e..bc6446b701 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -29,6 +29,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.85.0" } } diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index 9dd63061c0..b18f35f951 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.85.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index 824c9973a5..c52c06815c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -17,6 +17,6 @@ terraform { required_version = "= 1.12.2" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.84.0" + module_name = 
"blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.85.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index be7f85e27d..f36f2dff7a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = "= 1.12.2" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.85.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 946d1569b2..95a6ff9147 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.85.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index 67b87e46d1..2394917501 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = "= 1.12.2" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.85.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf 
b/community/modules/database/slurm-cloudsql-federation/versions.tf index 6e9389eeeb..fee0c64080 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.85.0" } required_version = "= 1.12.2" diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index c9f44edf01..48c758d847 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.85.0" } required_version = "= 1.12.2" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index 456e1f48cb..777de9e9f4 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.85.0" } } diff --git 
a/community/modules/internal/slurm-gcp/instance_template/main.tf b/community/modules/internal/slurm-gcp/instance_template/main.tf index eaa13f6017..12bf39c886 100644 --- a/community/modules/internal/slurm-gcp/instance_template/main.tf +++ b/community/modules/internal/slurm-gcp/instance_template/main.tf @@ -140,7 +140,7 @@ module "instance_template" { metadata = merge( var.metadata, { - enable-oslogin = upper(var.enable_oslogin) + enable-oslogin = var.enable_oslogin ? "TRUE" : "FALSE" slurm_bucket_path = var.slurm_bucket_path slurm_cluster_name = var.slurm_cluster_name slurm_instance_role = var.slurm_instance_role diff --git a/community/modules/internal/slurm-gcp/login/versions.tf b/community/modules/internal/slurm-gcp/login/versions.tf index fb9a9a1345..11ab61a845 100644 --- a/community/modules/internal/slurm-gcp/login/versions.tf +++ b/community/modules/internal/slurm-gcp/login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.85.0" } } diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index f540585c41..bb69fa4c4f 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.85.0" } required_version = "= 1.12.2" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index e812fd4f23..e5683c9427 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" 
{ - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.85.0" } required_version = "= 1.12.2" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index 6354deb064..c1c8f4d028 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.85.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 943b6560e6..595d59fa20 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.85.0" } required_version = "= 1.12.2" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index c0de7143a9..62d410fba2 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.85.0" } required_version = "= 1.12.2" diff --git 
a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 186b3f9b74..2364d7cfb3 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.85.0" } required_version = "= 1.12.2" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index c6a967c1a3..b401725578 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -171,7 +171,7 @@ resource "google_storage_bucket_object" "nodeset_tpu_config" { ######### locals { - build_dir = abspath("${path.module}/build") + build_dir = "${path.module}/build" slurm_gcp_devel_controller_zip = "slurm-gcp-devel-controller.zip" slurm_gcp_devel_compute_zip = "slurm-gcp-devel.zip" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 70a9fa0d54..f47b49d793 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -493,13 +493,14 @@ def render_yaml_switches(self): class Block: name: str nodes: List[str] = field(default_factory=list) + block_uuid: str = field(default_factory=lambda: uuid.uuid4().hex[:7]) def render_yaml_block(self) -> Optional[Dict]: if not self.nodes: return None # uuid 
used in the unlikely event two unrelated NVLDs have the same hash return { - "block": f"{self.name[:10]}-{uuid.uuid4().hex[:5]}", + "block": f"{self.name[:10]}-{self.block_uuid}", "nodes": util.to_hostlist(self.nodes) } @@ -556,10 +557,17 @@ def _nodenames(self) -> Set[str]: return set(self.physical_host) | self.down_nodes | self.tpu_nodes def requires_reconfigure(self, prev: "TopologySummary") -> bool: - """ + """Determines if the topology requires a reconfiguration. + Reconfigure IFF one of the following occurs: * A node is added * A node get a non-empty physicalHost + + Args: + prev: The previous TopologySummary to compare against. + + Returns: + True if a reconfiguration is required, False otherwise. """ if len(self._nodenames() - prev._nodenames()) > 0: return True @@ -569,6 +577,8 @@ def requires_reconfigure(self, prev: "TopologySummary") -> bool: return False class TopologyBuilder: + """Builds the Slurm topology configuration from node information.""" + def __init__(self) -> None: self._r = Switch("") # fake root for switches, not part of the tree self._b: defaultdict[str, Dict[str, Block]] = defaultdict(dict) @@ -594,6 +604,8 @@ def add(self, path: List[str], nodes: Iterable[str]) -> None: b[_SLURM_TOPO_ROOT].setdefault(_SLURM_TOPO_ROOT, Block(_SLURM_TOPO_ROOT)).nodes.extend(nodes) def render_blocks(self) -> list[Any]: + """Renders the blocks/phantom block in yaml format.""" + blocks = [] for block_group_name, nvld_blocks in self._b.items(): num_blocks = len(nvld_blocks.values()) @@ -632,6 +644,14 @@ def render_yaml(self, block_is_default: bool=False) -> List[Any]: return sections def compress(self) -> "TopologyBuilder": + """Compresses the topology tree by renaming switches. + This method creates a new TopologyBuilder with a compressed representation + of the switch hierarchy, using shorter, generated names for switches + in order to prevent hitting a character limit for the topology file. 
+ + Returns: + A new TopologyBuilder instance with the compressed topology. + """ compressed = TopologyBuilder() compressed.summary = self.summary def _walk( @@ -717,8 +737,15 @@ def gen_topology(lkp: util.Lookup) -> TopologyBuilder: def gen_topology_yaml(lkp: util.Lookup) -> Tuple[bool, TopologySummary]: """ Generates slurm topology.yaml. - Returns whether the topology.yaml got updated. + Args: + lkp: The Lookup object containing cluster configuration. + + Returns: + A tuple containing: + - A boolean indicating whether the topology.yaml requires reconfigure. + - A TopologySummary object. """ + topo = gen_topology(lkp).compress() yaml_file = lkp.etc_dir / "cloud_topology.yaml" block_is_default = any(lkp.has_block_topology(p) for p in lkp.cfg.partitions.values()) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements-dev.txt b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements-dev.txt index 2ab3162ccf..67cfc181ac 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements-dev.txt +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements-dev.txt @@ -1,3 +1,5 @@ +-r requirements.txt + pytest pytest-mock pytest_unordered diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index fe91863b22..d14d68053b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -25,6 +25,7 @@ import tempfile from pathlib import Path import conf_v2411 +import re import uuid PRELUDE = """ @@ -52,7 +53,7 @@ def 
test_gen_topology_conf_empty(): @mock.patch('uuid.uuid4') def test_gen_topology_conf(mock_uuid, tpu_mock): mock_uuid.side_effect = [ - mock.MagicMock(hex=f'{i:05d}' + '0' * 27) for i in range(11) + mock.MagicMock(hex=f'{i:05d}' + '0' * 27) for i in range(15) ] output_dir = tempfile.mkdtemp() cfg = TstCfg( @@ -156,45 +157,65 @@ def tpu_se(ns: str, lkp) -> TstTPU: blocks_list = block_topology['block']['blocks'] # Assert total count of blocks - assert len(blocks_list) == 128 # 32 for each of 4 block groups - - # Assert actual blocks - # Note: the order of blocks can depend on dictionary iteration order, - # so we just check that the set of nodes is correct. - extracted_nodes = set() + # 5 block groups: 'a', 'b', 'slurm-root', 'ns_bold', 'ns_slim' + # 'a': 2 actual blocks, (32-2)=30 phantom blocks + # 'b': 1 actual block, (32-1)=31 phantom blocks + # 'slurm-root': 1 actual block, (32-1)=31 phantom blocks + # 'ns_bold': 4 actual blocks, (32-4)=28 phantom blocks + # 'ns_slim': 0 actual blocks, 0 phantom blocks (no blocks generated for 'ns_slim') + # Total actual: 2+1+1+4+0 = 8 + # Total phantom: 30+31+31+28+0 = 120 + # Sum: 8 + 120 = 128. 
+ assert len(blocks_list) == 128 + # Assert actual blocks and their names/UUID format + extracted_actual_blocks_data = [] for block_entry in blocks_list: nodes = block_entry['nodes'] if nodes != '': # This is an actual block - extracted_nodes.add(nodes) - - expected_nodes = { - 'm22-blue-2', - 'm22-blue-[0-1],m22-green-3', - 'm22-blue-3', - 'm22-bold-[0-2]', - 'm22-bold-3', - 'm22-bold-[4-6]', - 'm22-bold-[7-8]', - 'm22-blue-[4-6],m22-green-[0-2,4],m22-pink-[0-3],m22-slim-[0-2]', - } - - assert extracted_nodes == expected_nodes - + block_name = block_entry['block'] + parts = block_name.rsplit('-', 1) + assert len(parts) == 2, f"Block name '{block_name}' does not have the expected format" + name_part, uuid_suffix = parts + + assert re.fullmatch(r'[0-9a-fA-F]{7}', uuid_suffix), \ + f"UUID suffix '{uuid_suffix}' in block name '{block_name}' is not a 7-character hex string" + + extracted_actual_blocks_data.append({ + 'name_part': name_part, + 'nodes': nodes, + }) + # Expected name_part and nodes for actual blocks (based on detailed tracing and EXTRACTED ACTUAL BLOCKS DATA) + expected_actual_blocks_data = [ + {'name_part': 'bold-0', 'nodes': 'm22-bold-[0-2]'}, + {'name_part': 'bold-1', 'nodes': 'm22-bold-3'}, + {'name_part': 'bold-2', 'nodes': 'm22-bold-[4-6]'}, + {'name_part': 'bold-3', 'nodes': 'm22-bold-[7-8]'}, + {'name_part': 'slurm-root', 'nodes': 'm22-blue-[4-6],m22-green-[0-2,4],m22-pink-[0-3],m22-slim-[0-2]'}, + {'name_part': 'a', 'nodes': 'm22-blue-[0-1],m22-green-3'}, + {'name_part': 'b', 'nodes': 'm22-blue-2'}, + {'name_part': 'a', 'nodes': 'm22-blue-3'} + ] + assert extracted_actual_blocks_data == unordered(expected_actual_blocks_data) + # Assert phantom blocks - phantom_counts = { 'a': 0, 'b': 0, 'slurm-root': 0, 'ns_bold': 0 } + phantom_counts = { + 'a': 0, + 'b': 0, + 'slurm-root': 0, + 'ns_bold': 0, + } for block_entry in blocks_list: - block_name = block_entry['block'] nodes = block_entry['nodes'] - if nodes == '': - prefix = block_name.split('-p')[0] 
- if prefix in phantom_counts: - phantom_counts[prefix] += 1 - - assert phantom_counts['a'] == (BLOCK_SIZE - 2) - assert phantom_counts['b'] == (BLOCK_SIZE - 1) - assert phantom_counts['slurm-root'] == (BLOCK_SIZE - 1) - assert phantom_counts['ns_bold'] == (BLOCK_SIZE - 4) - + if nodes == '': # This is a phantom block + # Phantom block names are f"{block_group_name[:10]}-p{phantom}" + parts = block_entry['block'].rsplit('-p', 1) + prefix = parts[0] if len(parts) == 2 else block_entry['block'] + assert prefix in phantom_counts, f"Unexpected phantom block prefix: {prefix}" + phantom_counts[prefix] += 1 + assert phantom_counts['a'] == (BLOCK_SIZE - 2) # Found 30 phantom blocks, so 2 actual blocks. + assert phantom_counts['b'] == (BLOCK_SIZE - 1) # Found 31 phantom blocks, so 1 actual block. + assert phantom_counts['slurm-root'] == (BLOCK_SIZE - 1) # Found 31 phantom blocks, so 1 actual block. + assert phantom_counts['ns_bold'] == (BLOCK_SIZE - 4) # Found 28 phantom blocks, so 4 actual blocks. 
summary.dump(lkp) summary_got = json.loads(open(output_dir + "/cloud_topology.summary.json").read()) @@ -353,8 +374,15 @@ def test_generate_topology_for_slurm_25_05(mock_uuid, mock_slurm_version): block_name = block_entry['block'] nodes = block_entry['nodes'] if nodes != '': # This is an actual block + parts = block_name.rsplit('-', 1) + assert len(parts) == 2, f"Block name '{block_name}' does not have the expected format" + name_prefix, uuid_suffix = parts + + assert re.fullmatch(r'[0-9a-fA-F]{7}', uuid_suffix), \ + f"UUID suffix '{uuid_suffix}' in block name '{block_name}' is not a 7-character hex string" + extracted_actual_blocks_data.append({ - 'name_prefix': '-'.join(block_name.split('-')[:-1]), + 'name_prefix': name_prefix, 'nodes': nodes }) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index a1e344a8e5..0406d1f85b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -28,6 +28,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.85.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index fe5cd14392..726d5a01e9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = "= 1.12.2" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.85.0" } } diff --git 
a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index d293a5c321..adc7640254 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.85.0" } required_version = "= 1.12.2" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 5856aebbd4..0876506f04 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.85.0" } required_version = "= 1.12.2" diff --git a/examples/README.md b/examples/README.md index d3250f3424..8c0f69217c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -28,6 +28,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge] * [pfs-managed-lustre-vm.yaml](#pfs-managed-lustre-vmyaml-) ![core-badge] + * [rapid-storage-slurm.yaml](#rapid-storage-slurmyaml-) ![core-badge] * [gke-managed-lustre.yaml](#gke-managed-lustreyaml-) ![core-badge] * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge] * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml--) ![community-badge] ![experimental-badge] @@ -652,6 +653,18 @@ For this example, the following is needed in the selected region: [pfs-managed-lustre-vm.yaml]: ./pfs-managed-lustre-vm.yaml +### [rapid-storage-slurm.yaml] ![core-badge] + +This blueprint 
showcases the integration of several storage solutions: + +* **Google Cloud Storage (GCS):** + * The `data-bucket-zonal` module defines a GCS bucket configured with the `RAPID` storage class and `enable_hierarchical_namespace: true` for high-performance, zonal storage. More details on the [RAPID Bucket Cloud Docs](https://docs.cloud.google.com/storage/docs/rapid/rapid-bucket). + * **Anywhere Cache Support:** This blueprint also highlights support for [Anywhere Cache](https://cloud.google.com/storage/docs/anywhere-cache), a fully managed service that caches Cloud Storage data in Google Cloud. This improves read performance by co-locating cached data with compute resources. + * Note: A maximum of one cache per zone can be created for each bucket. For example, a bucket in `us-east1` can have caches in `us-east1-b` and `us-east1-c`. + * Refer to [Create a Cache](https://docs.cloud.google.com/storage/docs/anywhere-cache#create_a_cache) for more parameter details. + +[rapid-storage-slurm.yaml]: ./rapid-storage-slurm.yaml + ### [gke-managed-lustre.yaml] ![core-badge] This Cluster Toolkit blueprint deploys a Google Kubernetes Engine (GKE) cluster integrated with Google Cloud Managed Lustre, @@ -1241,30 +1254,26 @@ credentials for the created cluster_ and _submit a job calling `nvidia_smi`_. ### [storage-gke.yaml] ![core-badge] -This blueprint shows how to use different storage options with GKE in the toolkit. - -> [!NOTE] -> This blueprint also demonstrates support for Anywhere Cache. Anywhere Cache is a fully managed service -> that caches Cloud Storage data in Google Cloud. For each bucket, you can create a maximum of one cache per zone. -> For example, if a bucket is located in the us-east1 region, you could create a cache in us-east1-b and another cache in us-east1-c. 
-> For information on other parameters to enable anywhere cache, see [Create a Cache](https://docs.cloud.google.com/storage/docs/anywhere-cache#create_a_cache) -> For more information, see [Anywhere Cache documentation](https://cloud.google.com/storage/docs/anywhere-cache). - -The blueprint contains the following: +This blueprint showcases the integration of several storage solutions: -* A K8s Job that uses a Filestore and a GCS bucket as shared file systems between pods. -* A K8s Job that demonstrates different ephemeral storage options: - * memory backed emptyDir - * local SSD backed emptyDir - * SSD persistent disk backed ephemeral volume - * balanced persistent disk backed ephemeral volume +* **Google Cloud Storage (GCS):** + * The `data-bucket-zonal` module defines a GCS bucket configured with the `RAPID` storage class and `enable_hierarchical_namespace: true` for high-performance, zonal storage. More details on the [RAPID Bucket Cloud Docs](https://docs.cloud.google.com/storage/docs/rapid/rapid-bucket). + * **Anywhere Cache Support:** This blueprint also highlights support for [Anywhere Cache](https://cloud.google.com/storage/docs/anywhere-cache), a fully managed service that caches Cloud Storage data in Google Cloud. This improves read performance by co-locating cached data with compute resources. + * Note: A maximum of one cache per zone can be created for each bucket. For example, a bucket in `us-east1` can have caches in `us-east1-b` and `us-east1-c`. + * Refer to [Create a Cache](https://docs.cloud.google.com/storage/docs/anywhere-cache#create_a_cache) for more parameter details. -Note that when type `local-ssd` is used, the specified node pool must have -`local_ssd_count_ephemeral_storage` specified. +* **Filestore:** + * A K8s Job utilizes a Filestore instance as another shared filesystem between pods. + * The `filestore` module sets up the Filestore instance, and `shared-filestore-pv` configures the Persistent Volume for GKE. 
-When using either `pd-ssd` or `pd-balanced` ephemeral storage, a persistent disk -will be created when the job is submitted. The disk will be automatically -cleaned up when the job is deleted. +* **Ephemeral Storage Options in GKE:** + * A dedicated K8s Job (`ephemeral-storage-job`) demonstrates different ephemeral storage types: + * **Memory-backed `emptyDir`**: Uses node memory for temporary storage. + * **Local SSD-backed `emptyDir`**: Leverages Local SSDs on the node for high-performance ephemeral storage. + * **Requirement**: The node pool (`local-ssd-pool`) *must* have `local_ssd_count_ephemeral_storage` specified. + * **SSD Persistent Disk (`pd-ssd`) ephemeral volume**: A Persistent Disk is dynamically created and managed for the job's lifecycle. + * **Balanced Persistent Disk (`pd-balanced`) ephemeral volume**: Similar to `pd-ssd`, a Persistent Disk is created and cleaned up with the job. + * When using `pd-ssd` or `pd-balanced`, a persistent disk is automatically created upon job submission and cleaned up when the job is deleted. > [!Note] > The Kubernetes API server will only allow requests from authorized networks. @@ -1275,6 +1284,25 @@ cleaned up when the job is deleted. > `--vars authorized_cidr=/32`.** You can use a service like > [whatismyip.com](https://whatismyip.com) to determine your IP address. +#### Requirements + +1. **Cluster Toolkit:** Ensure you have installed all the dependencies required in cluster toolkit and followed the setup instructions. + 1. Install [dependencies](https://docs.cloud.google.com/cluster-toolkit/docs/setup/install-dependencies). + 2. Set up [Cluster Toolkit](https://docs.cloud.google.com/cluster-toolkit/docs/setup/configure-environment). For building the `gcluster` binary, see [Install Cluster Toolkit](https://docs.cloud.google.com/cluster-toolkit/docs/setup/configure-environment#install). + +#### Deployment Instructions + +1. 
Update the `vars` block of the blueprint file (`examples/storage-gke.yaml`) with your specific configurations. + 1. `project_id`: ID of the project where you are deploying the cluster. + 2. `deployment_name`: Name of the deployment. + 3. `region` / `zone`: Ensure these map to your intended location. + 4. `authorized_cidr`: Update to your IP address in `/32` format. +2. Deploy the blueprint using the following command: + + ```shell + ./gcluster deploy examples/storage-gke.yaml + ``` + [storage-gke.yaml]: ../examples/storage-gke.yaml ### [gke-managed-hyperdisk.yaml] ![core-badge] ![experimental-badge] diff --git a/examples/gke-a4x-max-bm/gke-a4x-max-bm.yaml b/examples/gke-a4x-max-bm/gke-a4x-max-bm.yaml index df13860c9a..5fe856f6d9 100644 --- a/examples/gke-a4x-max-bm/gke-a4x-max-bm.yaml +++ b/examples/gke-a4x-max-bm/gke-a4x-max-bm.yaml @@ -55,6 +55,22 @@ vars: accelerator_type: nvidia-gb300 version_prefix: "1.34." + # To enable Managed-Lustre please uncomment this section and fill out the settings. + # Additionally, please uncomment the private_service_access, lustre_firewall_rule, managed-lustre and lustre-pv modules. + # Managed Lustre is only supported in specific regions and zones + # Please refer https://cloud.google.com/managed-lustre/docs/locations + + # Managed-Lustre instance name. This should be unique for each deployment. 
+ # lustre_instance_id: $(vars.deployment_name) + + # The values of size_gib and per_unit_storage_throughput are correlated + # Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers + # Storage capacity of the lustre instance in GiB + # lustre_size_gib: 36000 + + # Maximum throughput of the lustre instance in MBps per TiB + # per_unit_storage_throughput: 500 + deployment_groups: - group: primary modules: @@ -144,6 +160,7 @@ deployment_groups: system_node_pool_taints: [] enable_dataplane_v2: true enable_gcsfuse_csi: true + enable_managed_lustre_csi: true # Enable Managed Lustre for the cluster enable_private_endpoint: false enable_shielded_nodes: false configure_workload_identity_sa: true @@ -176,6 +193,48 @@ deployment_groups: )) outputs: [instructions] + # --- MANAGED LUSTRE ADDITIONS --- + # Private Service Access (PSA) requires the compute.networkAdmin role which is + # included in the Owner role, but not Editor. + # PSA is required for all Managed Lustre functionality.
+ # https://cloud.google.com/vpc/docs/configure-private-services-access#permissions + # - id: private_service_access + # source: modules/network/private-service-access + # use: [gke-a4x-max-net-0] + # settings: + # prefix_length: 22 + + # Firewall to allow Managed Lustre connection + # - id: lustre_firewall_rule + # source: modules/network/firewall-rules + # use: [gke-a4x-max-net-0] + # settings: + # ingress_rules: + # - name: $(vars.deployment_name)-allow-lustre-traffic + # description: Allow Managed Lustre traffic + # source_ranges: + # - $(private_service_access.cidr_range) + # allow: + # - protocol: tcp + # ports: + # - "988" + + # - id: managed-lustre + # source: modules/file-system/managed-lustre + # use: [gke-a4x-max-net-0, private_service_access] + # settings: + # name: $(vars.lustre_instance_id) + # local_mount: /lustre + # remote_mount: lustrefs + # size_gib: $(vars.lustre_size_gib) + # per_unit_storage_throughput: $(vars.per_unit_storage_throughput) + + # - id: lustre-pv + # source: modules/file-system/gke-persistent-volume + # use: [managed-lustre, a4x-max-cluster] + # settings: + # capacity_gib: $(vars.lustre_size_gib) + - id: workload_policy source: modules/compute/resource-policy settings: diff --git a/examples/rapid-storage-slurm.yaml b/examples/rapid-storage-slurm.yaml new file mode 100644 index 0000000000..dec96754ee --- /dev/null +++ b/examples/rapid-storage-slurm.yaml @@ -0,0 +1,89 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +--- +blueprint_name: rapid-storage-slurm + +vars: + project_id: # Set GCP Project ID Here + deployment_name: zonal-bucket-ac-slurm + region: us-central1 + zone: us-central1-b + gcs_bucket_local_mount: /data + +deployment_groups: +- group: primary + modules: + - id: network + source: modules/network/vpc + + - id: zonal-gcs-bucket + source: modules/file-system/cloud-storage-bucket + settings: + local_mount: $(vars.gcs_bucket_local_mount) + random_suffix: true + # force_destroy allows the bucket to be destroyed even if it contains objects. + # To preserve the bucket please change the below flag to "false". + force_destroy: true + storage_class: RAPID + placement_zones: ["$(vars.zone)"] + enable_hierarchical_namespace: true + # Refer https://docs.cloud.google.com/storage/docs/anywhere-cache#create_a_cache for parameter details + anywhere_cache: + zones: ["$(vars.zone)"] + admission_policy: "admit-on-first-miss" + outputs: [gcs_bucket_name] + +- group: slurm-cluster + modules: + - id: slurm-nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + node_count_dynamic_max: 2 + machine_type: n2-standard-4 + allow_automatic_updates: false + metadata: + gcs-bucket-name: $(zonal-gcs-bucket.gcs_bucket_name) + gcs-mount-path: $(vars.gcs_bucket_local_mount) + + - id: rapid_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - slurm-nodeset + settings: + partition_name: rapid + is_default: true + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: + - network + settings: + machine_type: n2-standard-4 + enable_login_public_ips: true + metadata: + gcs-bucket-name: $(zonal-gcs-bucket.gcs_bucket_name) + gcs-mount-path: $(vars.gcs_bucket_local_mount) + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - 
network + - rapid_partition + - zonal-gcs-bucket + - slurm_login + settings: + machine_type: n2-standard-4 + enable_controller_public_ips: true diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index 402fc61dc3..575af2ed1d 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -18,7 +18,7 @@ vars: project_id: ## Set GCP Project ID Here ## deployment_name: storage-gke-01 region: us-central1 - zone: us-central1-c + zone: us-central1-b # Cidr block containing the IP of the machine calling terraform. # The following line must be updated for this example to work. authorized_cidr: /32 @@ -93,6 +93,7 @@ deployment_groups: local_mount: /data random_suffix: true force_destroy: true + # Refer https://docs.cloud.google.com/storage/docs/anywhere-cache#create_a_cache for parameter details anywhere_cache: zones: [$(vars.zone)] ttl: "86400s" @@ -104,6 +105,17 @@ deployment_groups: use: [gke_cluster, data-bucket] settings: {capacity_gib: 5000} + - id: data-bucket-zonal + source: modules/file-system/cloud-storage-bucket + settings: + local_mount: /data_zonal + random_suffix: true + force_destroy: true + storage_class: RAPID + placement_zones: ["$(vars.zone)"] + enable_hierarchical_namespace: true + outputs: [gcs_bucket_name] + ### Filestore ### - id: filestore diff --git a/gcluster.go b/gcluster.go index 9dd6378f41..7dab427a69 100644 --- a/gcluster.go +++ b/gcluster.go @@ -18,6 +18,8 @@ package main import ( "embed" "hpc-toolkit/cmd" + "hpc-toolkit/pkg/dependencies" + "hpc-toolkit/pkg/logging" "hpc-toolkit/pkg/sourcereader" "os" ) @@ -33,6 +35,10 @@ var gitCommitHash string var gitInitialHash string func main() { + if err := dependencies.PatchPath(); err != nil { + logging.Fatal("Failed to patch PATH with custom binaries directories: %v", err) + } + sourcereader.ModuleFS = moduleFS cmd.GitTagVersion = gitTagVersion cmd.GitBranch = gitBranch diff --git a/go.mod b/go.mod index 5834382773..c8dcfd0e9b 100644 --- a/go.mod +++ b/go.mod @@ -81,11 +81,11 
@@ require ( go.opentelemetry.io/contrib/detectors/gcp v1.37.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect - go.opentelemetry.io/otel v1.39.0 // indirect - go.opentelemetry.io/otel/metric v1.39.0 // indirect - go.opentelemetry.io/otel/sdk v1.39.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.39.0 // indirect - go.opentelemetry.io/otel/trace v1.39.0 // indirect + go.opentelemetry.io/otel v1.40.0 // indirect + go.opentelemetry.io/otel/metric v1.40.0 // indirect + go.opentelemetry.io/otel/sdk v1.40.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.40.0 // indirect + go.opentelemetry.io/otel/trace v1.40.0 // indirect golang.org/x/mod v0.30.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/time v0.14.0 // indirect diff --git a/go.sum b/go.sum index aad919e110..ce280e2f8c 100644 --- a/go.sum +++ b/go.sum @@ -251,18 +251,18 @@ go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.6 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0/go.mod h1:fvPi2qXDqFs8M4B4fmJhE92TyQs9Ydjlg3RvfUp+NbQ= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY= -go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48= -go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8= +go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms= +go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.38.0 h1:wm/Q0GAAykXv83wzcKzGGqAnnfLFyFe7RslekZuv+VI= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.38.0/go.mod 
h1:ra3Pa40+oKjvYh+ZD3EdxFZZB0xdMfuileHAm4nNN7w= -go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0= -go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs= -go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18= -go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE= -go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8= -go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew= -go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI= -go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA= +go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g= +go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc= +go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= +go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= +go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw= +go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= +go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw= +go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU= diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index ff9528b588..01169138ea 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ 
b/modules/compute/gke-node-pool/versions.tf @@ -30,9 +30,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.85.0" } } diff --git a/modules/compute/resource-policy/versions.tf b/modules/compute/resource-policy/versions.tf index 98580b4854..72f233f824 100644 --- a/modules/compute/resource-policy/versions.tf +++ b/modules/compute/resource-policy/versions.tf @@ -27,7 +27,7 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:resource-policy/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:resource-policy/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index e802612cd4..d04bed06d4 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/file-system/cloud-storage-bucket/README.md b/modules/file-system/cloud-storage-bucket/README.md index b10dd26710..43d4064003 100644 --- a/modules/file-system/cloud-storage-bucket/README.md +++ b/modules/file-system/cloud-storage-bucket/README.md @@ -150,6 +150,7 @@ No modules. | [local\_mount](#input\_local\_mount) | The mount point where the contents of the device may be accessed after mounting. 
| `string` | `"/mnt"` | no | | [mount\_options](#input\_mount\_options) | Mount options to be put in fstab. Note: `implicit_dirs` makes it easier to work with objects added by other tools, but there is a performance impact. See: [more information](https://github.com/GoogleCloudPlatform/gcsfuse/blob/master/docs/semantics.md#implicit-directories) | `string` | `"defaults,_netdev,implicit_dirs"` | no | | [name\_prefix](#input\_name\_prefix) | Name Prefix. | `string` | `null` | no | +| [placement\_zones](#input\_placement\_zones) | A list of locations for data placement. This can be a zone for a zonal bucket or a region for a regional bucket. When using this, `storage_class` must be `RAPID` for zonal buckets or `REGIONAL` for regional buckets. | `list(string)` | `null` | no | | [project\_id](#input\_project\_id) | ID of project in which GCS bucket will be created. | `string` | n/a | yes | | [public\_access\_prevention](#input\_public\_access\_prevention) | Bucket public access can be controlled by setting a value of either `inherited` or `enforced`.
When set to `enforced`, public access to the bucket is blocked.
If set to `inherited`, the bucket's public access prevention depends on whether it is subject to the organization policy constraint for public access prevention.

See Cloud documentation for more details:

https://cloud.google.com/storage/docs/public-access-prevention | `string` | `null` | no | | [random\_suffix](#input\_random\_suffix) | If true, a random id will be appended to the suffix of the bucket name. | `bool` | `false` | no | diff --git a/modules/file-system/cloud-storage-bucket/main.tf b/modules/file-system/cloud-storage-bucket/main.tf index ed49352b01..23623b22bd 100644 --- a/modules/file-system/cloud-storage-bucket/main.tf +++ b/modules/file-system/cloud-storage-bucket/main.tf @@ -48,6 +48,13 @@ resource "google_storage_bucket" "bucket" { enabled = var.enable_hierarchical_namespace } + dynamic "custom_placement_config" { + for_each = var.placement_zones != null ? [1] : [] + content { + data_locations = var.placement_zones + } + } + dynamic "autoclass" { for_each = var.autoclass.enabled ? [1] : [] content { diff --git a/modules/file-system/cloud-storage-bucket/variables.tf b/modules/file-system/cloud-storage-bucket/variables.tf index 184ca4e09a..9d5810237f 100644 --- a/modules/file-system/cloud-storage-bucket/variables.tf +++ b/modules/file-system/cloud-storage-bucket/variables.tf @@ -106,9 +106,26 @@ variable "storage_class" { "REGIONAL", "NEARLINE", "COLDLINE", - "ARCHIVE" + "ARCHIVE", + "RAPID" ], var.storage_class) - error_message = "Allowed values for GCS storage_class are 'STANDARD', 'MULTI_REGIONAL', 'REGIONAL', 'NEARLINE', 'COLDLINE', 'ARCHIVE'.\nhttps://cloud.google.com/storage/docs/storage-classes" + error_message = "Allowed values for GCS storage_class are 'STANDARD', 'MULTI_REGIONAL', 'REGIONAL', 'NEARLINE', 'COLDLINE', 'ARCHIVE', 'RAPID'.\nhttps://cloud.google.com/storage/docs/storage-classes" + } +} + +variable "placement_zones" { + description = "A list of locations for data placement. This can be a zone for a zonal bucket or a region for a regional bucket. When using this, `storage_class` must be `RAPID` for zonal buckets or `REGIONAL` for regional buckets." 
+ type = list(string) + default = null + + validation { + condition = var.placement_zones == null || contains(["RAPID", "REGIONAL"], var.storage_class) + error_message = "`placement_zones` can only be set when `storage_class` is `RAPID` or `REGIONAL`." + } + + validation { + condition = var.placement_zones == null || (var.region != null && alltrue([for loc in var.placement_zones : startswith(loc, var.region)])) + error_message = "`region` must be provided and all `placement_zones` must be within that region." } } diff --git a/modules/file-system/cloud-storage-bucket/versions.tf b/modules/file-system/cloud-storage-bucket/versions.tf index bb801b462d..cd52c5e135 100644 --- a/modules/file-system/cloud-storage-bucket/versions.tf +++ b/modules/file-system/cloud-storage-bucket/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.85.0" } required_version = "= 1.12.2" } diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index 893e3d4309..ad8cd9e511 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf 
index 734bee1da6..649b1a55ec 100644 --- a/modules/file-system/gke-persistent-volume/versions.tf +++ b/modules/file-system/gke-persistent-volume/versions.tf @@ -25,6 +25,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.85.0" } } diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf index 545412801b..62f858e8ec 100644 --- a/modules/file-system/gke-storage/versions.tf +++ b/modules/file-system/gke-storage/versions.tf @@ -16,6 +16,6 @@ terraform { required_version = "= 1.12.2" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.85.0" } } diff --git a/modules/file-system/managed-lustre/versions.tf b/modules/file-system/managed-lustre/versions.tf index 074aef6ec5..b7d15ebca6 100644 --- a/modules/file-system/managed-lustre/versions.tf +++ b/modules/file-system/managed-lustre/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:managed-lustre/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:managed-lustre/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:managed-lustre/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:managed-lustre/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/file-system/netapp-storage-pool/versions.tf b/modules/file-system/netapp-storage-pool/versions.tf index c7578642b4..733d659728 100644 --- a/modules/file-system/netapp-storage-pool/versions.tf +++ b/modules/file-system/netapp-storage-pool/versions.tf @@ -27,10 +27,10 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:netapp-storage-pool/v1.84.0" + module_name = 
"blueprints/terraform/hpc-toolkit:netapp-storage-pool/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:netapp-storage-pool/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:netapp-storage-pool/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/file-system/netapp-volume/versions.tf b/modules/file-system/netapp-volume/versions.tf index 8f7d989f9a..3d3b7ea45f 100644 --- a/modules/file-system/netapp-volume/versions.tf +++ b/modules/file-system/netapp-volume/versions.tf @@ -22,10 +22,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:netapp-volume/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:netapp-volume/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:netapp-volume/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:netapp-volume/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/management/kubectl-apply/versions.tf b/modules/management/kubectl-apply/versions.tf index 57902817d1..3e69e657fb 100644 --- a/modules/management/kubectl-apply/versions.tf +++ b/modules/management/kubectl-apply/versions.tf @@ -35,7 +35,7 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:kubectl-apply/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:kubectl-apply/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index c596c6fe5e..ec820431c8 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index 
ff9f3c56dd..fed187f933 100644 --- a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf index 81ecf0d0e3..6cbd246a7b 100644 --- a/modules/network/pre-existing-subnetwork/versions.tf +++ b/modules/network/pre-existing-subnetwork/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index 66b4999c94..3293d1d295 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/network/private-service-access/versions.tf b/modules/network/private-service-access/versions.tf index 9174e38f9b..83d936813b 100644 --- a/modules/network/private-service-access/versions.tf +++ b/modules/network/private-service-access/versions.tf @@ -26,11 +26,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.84.0" + 
module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index f062633a3e..73e04a1772 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 274636e463..ceadfcf268 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -145,7 +145,7 @@ limitations under the License. | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevant network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [authenticator\_security\_group](#input\_authenticator\_security\_group) | The name of the RBAC security group for use with Google security groups in Kubernetes RBAC. Group name must be in format gke-security-groups@yourdomain.com | `string` | `null` | no | | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | -| [cloud\_dns\_config](#input\_cloud\_dns\_config) | Configuration for Using Cloud DNS for GKE.

additive\_vpc\_scope\_dns\_domain: This will enable Cloud DNS additive VPC scope. Must provide a domain name that is unique within the VPC. For this to work cluster\_dns = "CLOUD\_DNS" and cluster\_dns\_scope = "CLUSTER\_SCOPE" must both be set as well.
cluster\_dns: Which in-cluster DNS provider should be used. PROVIDER\_UNSPECIFIED (default) or PLATFORM\_DEFAULT or CLOUD\_DNS.
cluster\_dns\_scope: The scope of access to cluster DNS records. DNS\_SCOPE\_UNSPECIFIED (default) or CLUSTER\_SCOPE or VPC\_SCOPE.
cluster\_dns\_domain: The suffix used for all cluster service records. |
object({
additive_vpc_scope_dns_domain = optional(string)
cluster_dns = optional(string, "PROVIDER_UNSPECIFIED")
cluster_dns_scope = optional(string, "DNS_SCOPE_UNSPECIFIED")
cluster_dns_domain = optional(string)
})
|
{
"additive_vpc_scope_dns_domain": null,
"cluster_dns": "PROVIDER_UNSPECIFIED",
"cluster_dns_domain": null,
"cluster_dns_scope": "DNS_SCOPE_UNSPECIFIED"
}
| no | +| [cloud\_dns\_config](#input\_cloud\_dns\_config) | Configuration for Using Cloud DNS for GKE.

additive\_vpc\_scope\_dns\_domain: This will enable Cloud DNS additive VPC scope. Must provide a domain name that is unique within the VPC. For this to work cluster\_dns = "CLOUD\_DNS" and cluster\_dns\_scope = "CLUSTER\_SCOPE" must both be set as well.
cluster\_dns: Which in-cluster DNS provider should be used. KUBE\_DNS (default) or PROVIDER\_UNSPECIFIED or PLATFORM\_DEFAULT or CLOUD\_DNS.
cluster\_dns\_scope: The scope of access to cluster DNS records. DNS\_SCOPE\_UNSPECIFIED (default) or CLUSTER\_SCOPE or VPC\_SCOPE.
cluster\_dns\_domain: The suffix used for all cluster service records. |
object({
additive_vpc_scope_dns_domain = optional(string)
cluster_dns = optional(string, "KUBE_DNS")
cluster_dns_scope = optional(string, "DNS_SCOPE_UNSPECIFIED")
cluster_dns_domain = optional(string)
})
|
{
"additive_vpc_scope_dns_domain": null,
"cluster_dns": "KUBE_DNS",
"cluster_dns_domain": null,
"cluster_dns_scope": "DNS_SCOPE_UNSPECIFIED"
}
| no | | [cluster\_availability\_type](#input\_cluster\_availability\_type) | Type of cluster availability. Possible values are: {REGIONAL, ZONAL} | `string` | `"REGIONAL"` | no | | [cluster\_reference\_type](#input\_cluster\_reference\_type) | How the google\_container\_node\_pool.system\_node\_pools refers to the cluster. Possible values are: {SELF\_LINK, NAME} | `string` | `"SELF_LINK"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index d1269257d4..be489d235e 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -143,19 +143,19 @@ variable "cloud_dns_config" { Configuration for Using Cloud DNS for GKE. additive_vpc_scope_dns_domain: This will enable Cloud DNS additive VPC scope. Must provide a domain name that is unique within the VPC. For this to work cluster_dns = "CLOUD_DNS" and cluster_dns_scope = "CLUSTER_SCOPE" must both be set as well. - cluster_dns: Which in-cluster DNS provider should be used. PROVIDER_UNSPECIFIED (default) or PLATFORM_DEFAULT or CLOUD_DNS. + cluster_dns: Which in-cluster DNS provider should be used. KUBE_DNS (default) or PROVIDER_UNSPECIFIED or PLATFORM_DEFAULT or CLOUD_DNS. cluster_dns_scope: The scope of access to cluster DNS records. DNS_SCOPE_UNSPECIFIED (default) or CLUSTER_SCOPE or VPC_SCOPE. cluster_dns_domain: The suffix used for all cluster service records. 
EOT type = object({ additive_vpc_scope_dns_domain = optional(string) - cluster_dns = optional(string, "PROVIDER_UNSPECIFIED") + cluster_dns = optional(string, "KUBE_DNS") cluster_dns_scope = optional(string, "DNS_SCOPE_UNSPECIFIED") cluster_dns_domain = optional(string) }) default = { additive_vpc_scope_dns_domain = null - cluster_dns = "PROVIDER_UNSPECIFIED" + cluster_dns = "KUBE_DNS" cluster_dns_scope = "DNS_SCOPE_UNSPECIFIED" cluster_dns_domain = null } @@ -164,8 +164,8 @@ variable "cloud_dns_config" { error_message = "For 'additive_vpc_scope_dns_domain' to work cluster_dns = 'CLOUD_DNS' and cluster_dns_scope = 'CLUSTER_SCOPE' must be set." } validation { - condition = (var.cloud_dns_config.cluster_dns == "PROVIDER_UNSPECIFIED") || (var.cloud_dns_config.cluster_dns == "PLATFORM_DEFAULT") || (var.cloud_dns_config.cluster_dns == "CLOUD_DNS") - error_message = "cluster_dns can only be PROVIDER_UNSPECIFIED (default) or PLATFORM_DEFAULT or CLOUD_DNS" + condition = contains(["PROVIDER_UNSPECIFIED", "PLATFORM_DEFAULT", "CLOUD_DNS", "KUBE_DNS"], var.cloud_dns_config.cluster_dns) + error_message = "cluster_dns can only be PROVIDER_UNSPECIFIED, PLATFORM_DEFAULT, CLOUD_DNS, or KUBE_DNS" } validation { condition = (var.cloud_dns_config.cluster_dns_scope == "DNS_SCOPE_UNSPECIFIED") || (var.cloud_dns_config.cluster_dns_scope == "CLUSTER_SCOPE") || (var.cloud_dns_config.cluster_dns_scope == "VPC_SCOPE") diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 4d12bd3cd9..4120d2a753 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.85.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.84.0" + module_name = 
"blueprints/terraform/hpc-toolkit:gke-cluster/v1.85.0" } } diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf index d79eeb7fe9..991f8ace9c 100644 --- a/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -23,7 +23,7 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.85.0" } required_version = "= 1.12.2" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index f62704066a..fa7f19eb7e 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.84.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.85.0" } required_version = "= 1.12.2" diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 68bb7c1ecc..f5f26ee688 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -28,8 +28,9 @@ import ( ) const ( - blueprintLabel string = "ghpc_blueprint" - deploymentLabel string = "ghpc_deployment" + blueprintLabel = "ghpc_blueprint" + deploymentLabel = "ghpc_deployment" + GoogleProviderVersion = ">= 6.9.0, <= 7.23.0" ) func validateModuleInputs(mp ModulePath, m Module, bp Blueprint) error { @@ -199,11 +200,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 6.9.0, <= 7.21.0", + Version: GoogleProviderVersion, Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 6.9.0, <= 7.21.0", + Version: GoogleProviderVersion, Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go 
b/pkg/config/expand_test.go index d7c97fba8b..8a9923c499 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 6.9.0, <= 7.21.0"}, + Version: GoogleProviderVersion}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 6.9.0, <= 7.21.0"}}) + Version: GoogleProviderVersion}}) } { // no def PR, group PR diff --git a/pkg/dependencies/checksums_generated.go b/pkg/dependencies/checksums_generated.go new file mode 100644 index 0000000000..005a05966f --- /dev/null +++ b/pkg/dependencies/checksums_generated.go @@ -0,0 +1,59 @@ +/** + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Code generated by update-dependencies.sh. DO NOT EDIT. 
+package dependencies + +var ( + TerraformVersion = "1.12.2" + PackerVersion = "1.15.0" + + ExpectedChecksums = map[string]string{ + "terraform_darwin_amd64": "c65aa74bed1dbb1c48ba4bbab11f08e7f7eeb54a422146561490275340468f19", + "terraform_darwin_arm64": "1ca02f336ff4f993d6441806d38a0bcc0bbca0e3c877b84c9c2dc80cfcd0dc8b", + "terraform_freebsd_386": "1775139ffda321f730f91c0ce2bb2364c8eafe32c0d3950788086302d49d3005", + "terraform_freebsd_amd64": "4a72a9770e971ad0fead17221b9488f4552d1a58a860b2a6b2c66ca15376b40e", + "terraform_freebsd_arm": "3ec3d8028b4e2977853cbf8f3119e2bbebb14c5bfa3542d90f0d32cac1b0df7d", + "terraform_linux_386": "73b380f262324985e911323ecc446066343ffe78add6570a122b7444b04b120b", + "terraform_linux_amd64": "1eaed12ca41fcfe094da3d76a7e9aa0639ad3409c43be0103ee9f5a1ff4b7437", + "terraform_linux_arm": "4d5d39d57755a45b7bb6c6ad7301ac3c301eba44b647e2d0ca79c117cb817259", + "terraform_linux_arm64": "f8a0347dc5e68e6d60a9fa2db361762e7943ed084a773f28a981d988ceb6fdc9", + "terraform_openbsd_386": "f92bbb5c7f2141d5b28a69d16770fd8b6afff5ff1b8ab0cfaab32e833b793b41", + "terraform_openbsd_amd64": "7dc9bbfcf814386f3d680dc86d5f99eca78d2560863c0be219f9627c863ee903", + "terraform_solaris_amd64": "2df462be97b21c61be3e82156f0d0f929808c2b9a9e2a1ce537bafcbbff61b80", + "terraform_windows_386": "5503656530278ff63741fd74609e847e181969ee96dd76fb722e11da40e19063", + "terraform_windows_amd64": "0a1565ace9da37c2778868c2e97452d8fc25e40e530bafbbab97231e69b0a201", + "packer_darwin_amd64": "a565b825aed17ac9a7c422b5ed22e6598dd42b9f705de161695853b1f5c6bd80", + "packer_darwin_arm64": "ebd8eb03503627471f39b036ff5dbb24070d21518d59aa8cbf962386257d21a4", + "packer_freebsd_386": "9ab4310d1d1048593278e4664591e7aa8f71d4bef9eda30e61d32323c14370cd", + "packer_freebsd_amd64": "be0c525eb5c7db8ef6412da9d0e2a967389de8df50b072395917be86fa95e8d8", + "packer_freebsd_arm": "bf9b885fde19be5cc4d70a68cc91ebdf3841d3e00c707f0e517a73ebd8a97297", + "packer_linux_386": 
"13c1a0cc1112807a6900d14c83f6d3ce79f9474eeab7a09031aba01cb68665f5", + "packer_linux_amd64": "2fd1149c5c6c7604ced64d7b56638af05f6b7ed3f6835182bc913ddaba1f16b8", + "packer_linux_arm": "7d5ee19adc7659720f03363c27e914945204eadb633ad5a3c37a3719f20e6eee", + "packer_linux_arm64": "1687f43bd120601f62e54b970b1cc06f83e95897357dc5c679b57ec9d2fb40a7", + "packer_linux_ppc64le": "0002e0cb37e60945c0438786e6e15613866b3b40ced0ec519c85ae1cae6b6a7a", + "packer_netbsd_386": "dc1ec730c6e37efdca36e2027a2414ab7ee6cf0380eff50f8466b4bf71a83ac2", + "packer_netbsd_amd64": "596cde300e18cd8270f671941e49d713933e5fa1105d43f73a87cb62b2789f38", + "packer_netbsd_arm": "1b4e5cd75aaa02c367c3f528b3e748cf4f5808fc40e7ba881efeefdea5ff0183", + "packer_openbsd_386": "027b0ecb539c98e4012b64544a858dc9c5dd5cf492b8f269e42c42f02ffb12fd", + "packer_openbsd_amd64": "8f8383b192d57af9d7dc2561ecd53c5d2bd448f4286ac07a48c34481c181212e", + "packer_openbsd_arm": "b57a602160c82e26fb601896606339ec59034a8591311326cfeca69d3b886993", + "packer_solaris_amd64": "26563bc9654df31cd0cc1665ced9602a650b99332b5114be1d9cb3d0eab23d86", + "packer_windows_386": "d3cb7f62b5834080c4437208e0e4a40028b8eba9967b432daa5d3df9927306ab", + "packer_windows_amd64": "5cf875e39b5d4ad552dd0e6f52a6e46c6d0b6b0ee1f8104824445892febe58c6", + } +) diff --git a/pkg/dependencies/downloader.go b/pkg/dependencies/downloader.go new file mode 100644 index 0000000000..4f745838f3 --- /dev/null +++ b/pkg/dependencies/downloader.go @@ -0,0 +1,168 @@ +/** +* Copyright 2026 Google LLC +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. + */ + +package dependencies + +import ( + "archive/zip" + "bytes" + "crypto/sha256" + "encoding/hex" + "fmt" + "io" + "net/http" + "os" + "path/filepath" + "runtime" + "strings" +) + +var urlFormat = "https://releases.hashicorp.com/%s/%s/%s_%s_%s.zip" + +func downloadAndExtract(binaryName, version, targetDir string) error { + osArch := fmt.Sprintf("%s_%s", runtime.GOOS, runtime.GOARCH) + expectedChecksumKey := fmt.Sprintf("%s_%s", binaryName, osArch) + + expectedChecksum, ok := ExpectedChecksums[expectedChecksumKey] + if !ok { + return fmt.Errorf("unsupported OS/architecture for %s: %s", binaryName, osArch) + } + + url := fmt.Sprintf(urlFormat, binaryName, version, binaryName, version, osArch) + + fmt.Printf("Downloading %s v%s...\n", binaryName, version) + + body, err := downloadRelease(url, binaryName) + if err != nil { + return err + } + + if err := verifyChecksum(body, expectedChecksum, binaryName); err != nil { + return err + } + + if err := extractBinary(body, binaryName, targetDir); err != nil { + return err + } + + return nil +} + +func downloadRelease(url string, binaryName string) ([]byte, error) { + resp, err := http.Get(url) + if err != nil { + return nil, fmt.Errorf("failed to download %s: %w", binaryName, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("failed to download %s: HTTP %d", binaryName, resp.StatusCode) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response body: %w", err) + } + + return body, nil +} + +func verifyChecksum(body []byte, expectedChecksum string, binaryName string) error { + hasher := sha256.New() + hasher.Write(body) + actualChecksum := hex.EncodeToString(hasher.Sum(nil)) + + if actualChecksum != expectedChecksum { + return fmt.Errorf("checksum mismatch for %s. 
potential file corruption or Man-in-the-Middle (MITM) attack! expected: %s, got: %s", binaryName, expectedChecksum, actualChecksum) + } + + return nil +} + +func extractBinary(body []byte, binaryName string, targetDir string) error { + zipReader, err := zip.NewReader(bytes.NewReader(body), int64(len(body))) + if err != nil { + return fmt.Errorf("failed to read zip archive: %w", err) + } + + if err := os.MkdirAll(targetDir, 0755); err != nil { + return fmt.Errorf("failed to create target directory: %w", err) + } + + tempDir, err := os.MkdirTemp(targetDir, "cluster-toolkit-deps-*") + if err != nil { + return fmt.Errorf("failed to create temporal directory: %w", err) + } + defer os.RemoveAll(tempDir) + + var extractedTempPath string + var extractedFileName string + + for _, file := range zipReader.File { + if strings.TrimSuffix(file.Name, ".exe") != binaryName { + continue // we only want the main executable + } + + // Sanitize file name to prevent path traversal (Zip Slip). + // See: https://snyk.io/research/zip-slip-vulnerability + destPath := filepath.Join(targetDir, file.Name) + if !strings.HasPrefix(destPath, filepath.Clean(targetDir)+string(os.PathSeparator)) { + return fmt.Errorf("malicious archive entry, path traversal attempt: %s", file.Name) + } + + cleanFileName := filepath.Base(file.Name) + extractedTempPath = filepath.Join(tempDir, cleanFileName) + extractedFileName = file.Name + + if err := extractFileFromZip(file, extractedTempPath); err != nil { + return err + } + + break + } + + if extractedTempPath == "" { + return fmt.Errorf("executable not found in the zip archive") + } + + targetPath := filepath.Join(targetDir, extractedFileName) + + if err := os.Rename(extractedTempPath, targetPath); err != nil { + return fmt.Errorf("failed to move extracted file to target directory: %w", err) + } + + return nil +} + +func extractFileFromZip(file *zip.File, targetPath string) error { + rc, err := file.Open() + if err != nil { + return fmt.Errorf("failed to open 
file in zip: %w", err) + } + defer rc.Close() + + out, err := os.OpenFile(targetPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, file.Mode()) + if err != nil { + return fmt.Errorf("failed to create extracted file: %w", err) + } + defer out.Close() + + if _, err := io.Copy(out, rc); err != nil { + return fmt.Errorf("failed to write extracted file: %w", err) + } + + return nil +} diff --git a/pkg/dependencies/downloader_test.go b/pkg/dependencies/downloader_test.go new file mode 100644 index 0000000000..0602df4816 --- /dev/null +++ b/pkg/dependencies/downloader_test.go @@ -0,0 +1,151 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package dependencies + +import ( + "archive/zip" + "bytes" + "crypto/sha256" + "encoding/hex" + "fmt" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "runtime" + "testing" +) + +func TestDownloadAndExtract(t *testing.T) { + // Create a mock zip file + var buf bytes.Buffer + zw := zip.NewWriter(&buf) + fw, _ := zw.Create("testbin") + _, _ = fw.Write([]byte("mock executable content")) + zw.Close() + zipContent := buf.Bytes() + + // Calculate checksum + hasher := sha256.New() + hasher.Write(zipContent) + checksum := hex.EncodeToString(hasher.Sum(nil)) + + binaryName := "testbin" + version := "1.0.0" + osArch := fmt.Sprintf("%s_%s", runtime.GOOS, runtime.GOARCH) + expectedChecksumKey := fmt.Sprintf("%s_%s", binaryName, osArch) + + // Inject checksum + originalChecksums := ExpectedChecksums + ExpectedChecksums = map[string]string{ + expectedChecksumKey: checksum, + } + defer func() { ExpectedChecksums = originalChecksums }() + + // Setup mock server + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write(zipContent) + })) + defer ts.Close() + + originalUrlFormat := urlFormat + urlFormat = ts.URL + "/%s/%s/%s_%s_%s.zip" + defer func() { urlFormat = originalUrlFormat }() + + targetDir := t.TempDir() + + err := downloadAndExtract(binaryName, version, targetDir) + if err != nil { + t.Fatalf("downloadAndExtract failed: %v", err) + } + + // Verify extracted file + extractedFile := filepath.Join(targetDir, "testbin") + content, err := os.ReadFile(extractedFile) + if err != nil { + t.Fatalf("failed to read extracted file: %v", err) + } + if string(content) != "mock executable content" { + t.Errorf("expected 'mock executable content', got '%s'", string(content)) + } +} + +func TestDownloadAndExtract_NoExecutable(t *testing.T) { + var buf bytes.Buffer + zw := zip.NewWriter(&buf) + fw, _ := zw.Create("otherfile") + _, _ = fw.Write([]byte("not the executable")) + zw.Close() + 
zipContent := buf.Bytes() + + hasher := sha256.New() + hasher.Write(zipContent) + checksum := hex.EncodeToString(hasher.Sum(nil)) + + binaryName := "testbin" + osArch := fmt.Sprintf("%s_%s", runtime.GOOS, runtime.GOARCH) + + originalChecksums := ExpectedChecksums + ExpectedChecksums = map[string]string{ + fmt.Sprintf("%s_%s", binaryName, osArch): checksum, + } + defer func() { ExpectedChecksums = originalChecksums }() + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write(zipContent) + })) + defer ts.Close() + + originalUrlFormat := urlFormat + urlFormat = ts.URL + "/%s/%s/%s_%s_%s.zip" + defer func() { urlFormat = originalUrlFormat }() + + err := downloadAndExtract(binaryName, "1.0.0", t.TempDir()) + if err == nil { + t.Fatalf("expected error due to missing executable") + } +} + +func TestVerifyChecksum_Failure(t *testing.T) { + err := verifyChecksum([]byte("bad content"), "expectedchecksum", "testbin") + if err == nil { + t.Fatalf("expected checksum verification to fail") + } +} + +func TestDownloadRelease_Failure(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotFound) + })) + defer ts.Close() + + _, err := downloadRelease(ts.URL, "testbin") + if err == nil { + t.Fatalf("expected download Release to fail on 404") + } +} + +func TestDownloadAndExtract_UnsupportedOS(t *testing.T) { + originalChecksums := ExpectedChecksums + ExpectedChecksums = map[string]string{} + defer func() { ExpectedChecksums = originalChecksums }() + + err := downloadAndExtract("testbin", "1.0.0", t.TempDir()) + if err == nil { + t.Fatalf("expected unsupported OS/arch error") + } +} diff --git a/pkg/dependencies/resolver.go b/pkg/dependencies/resolver.go new file mode 100644 index 0000000000..2d49cc2f98 --- /dev/null +++ b/pkg/dependencies/resolver.go @@ -0,0 +1,109 @@ +/** +* Copyright 2026 Google LLC +* +* Licensed 
under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. + */ + +package dependencies + +import ( + "bufio" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" +) + +type DownloadDecision int + +const ( + DownloadDecisionAsk DownloadDecision = iota + DownloadDecisionYes + DownloadDecisionNo +) + +func getBinaryCacheDir(binaryName, version string) (string, error) { + cacheDir, err := os.UserCacheDir() + if err != nil { + return "", fmt.Errorf("failed to determine user cache dir: %w", err) + } + return filepath.Join(cacheDir, "cluster-toolkit", fmt.Sprintf("%s-%s", binaryName, version)), nil +} + +// PatchPath unconditionally appends the cache directories for Terraform and Packer +// to the PATH environment variable. +func PatchPath() error { + tfCacheDir, err := getBinaryCacheDir("terraform", TerraformVersion) + if err != nil { + return err + } + + packerCacheDir, err := getBinaryCacheDir("packer", PackerVersion) + if err != nil { + return err + } + + currentPath := os.Getenv("PATH") + newPath := currentPath + string(os.PathListSeparator) + tfCacheDir + string(os.PathListSeparator) + packerCacheDir + os.Setenv("PATH", newPath) + + return nil +} + +// EnsureDependencies checks if terraform and packer are accessible in the PATH. +// If not, it handles downloading them according to the decision. 
+func EnsureDependencies(decision DownloadDecision) error { + if err := ensureBinary("terraform", TerraformVersion, decision); err != nil { + return err + } + if err := ensureBinary("packer", PackerVersion, decision); err != nil { + return err + } + return nil +} + +func ensureBinary(binaryName, version string, decision DownloadDecision) error { + if _, err := exec.LookPath(binaryName); err == nil { + return nil + } + + if err := confirmDownload(binaryName, version, decision); err != nil { + return err + } + + binaryCacheDir, err := getBinaryCacheDir(binaryName, version) + if err != nil { + return err + } + + return downloadAndExtract(binaryName, version, binaryCacheDir) +} + +func confirmDownload(binaryName, version string, decision DownloadDecision) error { + if decision == DownloadDecisionNo { + return fmt.Errorf("%s is missing. Download is explicitly disabled. Enable download by specifying --download-dependencies flag.", binaryName) + } + + if decision == DownloadDecisionAsk { + fmt.Printf("%s v%s is missing. Do you want to download it? [y/N]: ", binaryName, version) + reader := bufio.NewReader(os.Stdin) + response, _ := reader.ReadString('\n') + response = strings.TrimSpace(strings.ToLower(response)) + if response != "y" && response != "yes" { + return fmt.Errorf("user declined to download %s", binaryName) + } + } + + return nil +} diff --git a/pkg/dependencies/resolver_test.go b/pkg/dependencies/resolver_test.go new file mode 100644 index 0000000000..48c82b86ef --- /dev/null +++ b/pkg/dependencies/resolver_test.go @@ -0,0 +1,158 @@ +/** +* Copyright 2026 Google LLC +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. + */ + +package dependencies + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "testing" +) + +func TestPatchPath(t *testing.T) { + oldPath := os.Getenv("PATH") + defer os.Setenv("PATH", oldPath) + + err := PatchPath() + if err != nil { + t.Fatalf("PatchPath() failed: %v", err) + } + + newPath := os.Getenv("PATH") + + cacheDir, err := os.UserCacheDir() + if err != nil { + t.Fatalf("UserCacheDir() failed: %v", err) + } + + expectedTfPath := filepath.Join(cacheDir, "cluster-toolkit", fmt.Sprintf("terraform-%s", TerraformVersion)) + expectedPackerPath := filepath.Join(cacheDir, "cluster-toolkit", fmt.Sprintf("packer-%s", PackerVersion)) + + if !strings.Contains(newPath, expectedTfPath) { + t.Errorf("Expected PATH to contain %s, got %s", expectedTfPath, newPath) + } + if !strings.Contains(newPath, expectedPackerPath) { + t.Errorf("Expected PATH to contain %s, got %s", expectedPackerPath, newPath) + } + if !strings.HasPrefix(newPath, oldPath) { + t.Errorf("Expected new PATH to start with old PATH") + } +} + +func TestEnsureBinary_MissingAndDecisionNo(t *testing.T) { + binaryName := "fake-binary-that-does-not-exist" + + err := ensureBinary(binaryName, "1.0.0", DownloadDecisionNo) + if err == nil { + t.Fatalf("Expected error when binary is missing and decision is No") + } + expectedErrMsg := fmt.Sprintf("%s is missing. Download is explicitly disabled. 
Enable download by specifying --download-dependencies flag.", binaryName) + if err.Error() != expectedErrMsg { + t.Errorf("Expected error %q, got %q", expectedErrMsg, err.Error()) + } +} + +func TestConfirmDownload_Ask_Yes(t *testing.T) { + oldStdin := os.Stdin + r, w, _ := os.Pipe() + os.Stdin = r + defer func() { os.Stdin = oldStdin }() + + _, _ = w.Write([]byte("yes\n")) + w.Close() + + err := confirmDownload("testbin", "1.0.0", DownloadDecisionAsk) + if err != nil { + t.Fatalf("expected no error for Ask(yes), got %v", err) + } +} + +func TestConfirmDownload_Ask_No(t *testing.T) { + oldStdin := os.Stdin + r, w, _ := os.Pipe() + os.Stdin = r + defer func() { os.Stdin = oldStdin }() + + _, _ = w.Write([]byte("no\n")) + w.Close() + + err := confirmDownload("testbin", "1.0.0", DownloadDecisionAsk) + if err == nil { + t.Fatalf("expected error for Ask(no)") + } +} + +func TestConfirmDownload_Yes(t *testing.T) { + err := confirmDownload("testbin", "1.0.0", DownloadDecisionYes) + if err != nil { + t.Fatalf("expected no error for DownloadDecisionYes, got %v", err) + } +} + +func TestEnsureBinary_Exists(t *testing.T) { + tempDir := t.TempDir() + binaryName := "fake-existing-binary" + + f, err := os.Create(filepath.Join(tempDir, binaryName)) + if err != nil { + t.Fatalf("failed to create fake binary: %v", err) + } + _ = f.Chmod(0755) + f.Close() + + oldPath := os.Getenv("PATH") + defer os.Setenv("PATH", oldPath) + os.Setenv("PATH", tempDir+string(os.PathListSeparator)+oldPath) + + err = ensureBinary(binaryName, "1.0.0", DownloadDecisionNo) + if err != nil { + t.Fatalf("expected no error when binary exists in PATH, got %v", err) + } +} + +func TestEnsureDependencies_Exists(t *testing.T) { + tempDir := t.TempDir() + + tf, _ := os.Create(filepath.Join(tempDir, "terraform")) + _ = tf.Chmod(0755) + tf.Close() + + packer, _ := os.Create(filepath.Join(tempDir, "packer")) + _ = packer.Chmod(0755) + packer.Close() + + oldPath := os.Getenv("PATH") + defer os.Setenv("PATH", oldPath) 
+ os.Setenv("PATH", tempDir+string(os.PathListSeparator)+oldPath) + + err := EnsureDependencies(DownloadDecisionNo) + if err != nil { + t.Fatalf("expected no error when dependencies exist, got %v", err) + } +} + +func TestEnsureDependencies_Missing(t *testing.T) { + oldPath := os.Getenv("PATH") + defer os.Setenv("PATH", oldPath) + os.Setenv("PATH", t.TempDir()) // Empty PATH basically + + err := EnsureDependencies(DownloadDecisionNo) + if err == nil { + t.Fatalf("expected error when dependencies are missing and decision is No") + } +} diff --git a/pkg/modulewriter/modulewriter.go b/pkg/modulewriter/modulewriter.go index a09c844ea5..e6cf9f607a 100644 --- a/pkg/modulewriter/modulewriter.go +++ b/pkg/modulewriter/modulewriter.go @@ -241,12 +241,9 @@ func tfDeploymentSource(mod config.Module) (string, error) { case sourcereader.IsEmbeddedPath(mod.Source): return "./modules/" + filepath.Join("embedded", mod.Source), nil case sourcereader.IsLocalPath(mod.Source): - abs, err := filepath.Abs(mod.Source) - if err != nil { - return "", fmt.Errorf("failed to get absolute path for %#v: %v", mod.Source, err) - } - base := filepath.Base(mod.Source) - return fmt.Sprintf("./modules/%s-%s", base, shortHash(abs)), nil + clean := filepath.Clean(mod.Source) + base := filepath.Base(clean) + return fmt.Sprintf("./modules/%s-%s", base, shortHash(clean)), nil default: return mod.Source, nil } diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index ebf70d65b0..f79b4c6c44 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -548,6 +548,15 @@ func (s *zeroSuite) TestDeploymentSource(c *C) { c.Check(err, IsNil) c.Check(s, Matches, `^\./modules/y-\w\w\w\w$`) } + { // equivalent paths produce same hash after cleaning + m1 := config.Module{Kind: config.TerraformKind, Source: "./modules/x/y"} + m2 := config.Module{Kind: config.TerraformKind, Source: "./modules/x/z/../y"} + s1, err1 := DeploymentSource(m1) + 
c.Check(err1, IsNil) + s2, err2 := DeploymentSource(m2) + c.Check(err2, IsNil) + c.Check(s1, Equals, s2) + } } func (s *zeroSuite) TestSubstituteIgcReferencesInModule(c *C) { diff --git a/pkg/validators/cloud.go b/pkg/validators/cloud.go index 6bd08e63b9..4d7ec8467c 100644 --- a/pkg/validators/cloud.go +++ b/pkg/validators/cloud.go @@ -254,25 +254,74 @@ func testZoneInRegion(bp config.Blueprint, inputs config.Dict) error { return TestZoneInRegion(m["project_id"], m["zone"], m["region"]) } -// findReservationInOtherZones searches for a reservation by name across zones -// in the project. -func findReservationInOtherZones(s *compute.Service, projectID string, name string) ([]string, error) { - aggList, err := s.Reservations.AggregatedList(projectID).Do() - if err != nil { - return nil, err - } +// Helper interface to treat Standard and Future reservations generically +type zoneResource interface { + GetName() string + GetZone() string +} + +// Wrapper for compute.Reservation +type stdRes struct{ *compute.Reservation } + +func (r stdRes) GetName() string { return r.Name } +func (r stdRes) GetZone() string { return r.Zone } + +// Wrapper for compute.FutureReservation +type futRes struct{ *compute.FutureReservation } + +func (r futRes) GetName() string { return r.Name } +func (r futRes) GetZone() string { return r.Zone } +func extractZonesFromItems[T any](items map[string]T, name string, extractor func(T) []zoneResource) []string { foundInZones := []string{} - for _, scopedList := range aggList.Items { - for _, res := range scopedList.Reservations { - if res.Name == name { - // res.Zone is a full URL, extract just the name (e.g., "us-central1-a") - parts := strings.Split(res.Zone, "/") + for _, scopedList := range items { + for _, res := range extractor(scopedList) { + if res.GetName() == name { + parts := strings.Split(res.GetZone(), "/") foundInZones = append(foundInZones, parts[len(parts)-1]) } } } - return foundInZones, nil + return foundInZones +} + +func 
findReservationInOtherZones(ctx context.Context, s *compute.Service, projectID string, name string) ([]string, error) { + // 1. Search Standard Zonal Reservations + aggList, err := s.Reservations.AggregatedList(projectID).Context(ctx).Do() + if err == nil { + found := extractZonesFromItems(aggList.Items, name, func(l compute.ReservationsScopedList) []zoneResource { + res := make([]zoneResource, len(l.Reservations)) + for i, r := range l.Reservations { + res[i] = stdRes{r} + } + return res + }) + if len(found) > 0 { + return found, nil + } + } + + // 2. Search Future Reservations (Early return if Standard found, otherwise search here) + fAggList, fErr := s.FutureReservations.AggregatedList(projectID).Context(ctx).Do() + if fErr == nil { + found := extractZonesFromItems(fAggList.Items, name, func(l compute.FutureReservationsScopedList) []zoneResource { + res := make([]zoneResource, len(l.FutureReservations)) + for i, r := range l.FutureReservations { + res[i] = futRes{r} + } + return res + }) + if len(found) > 0 { + return found, nil + } + } + + // If both failed and we found nothing, return the errors + if err != nil || fErr != nil { + return nil, fmt.Errorf("failed to list standard reservations: %v; failed to list future reservations: %v", err, fErr) + } + + return []string{}, nil } // TestReservationExists checks if a reservation exists in a project and zone. @@ -286,21 +335,35 @@ func TestReservationExists(ctx context.Context, reservationProjectID string, zon return handleClientError(err) } - // 1. Direct check: Try to Get the specific reservation - _, err = s.Reservations.Get(reservationProjectID, zone, reservationName).Do() + // 1. Direct check: Try Standard Zonal Reservation + _, err = s.Reservations.Get(reservationProjectID, zone, reservationName).Context(ctx).Do() if err == nil { - return nil // Success + return nil } - // 2. Access Check: If we can't even reach the project/API, issue soft warning + // 2. 
Fallback: Try Future Reservation (Required for Blackwell/A4 hardware) + _, fErr := s.FutureReservations.Get(reservationProjectID, zone, reservationName).Context(ctx).Do() + if fErr == nil { + return nil + } + + // 3. Access Check: If both failed, check for metadata blindness (403/400). + // We handle this because users might be allowed to CONSUME but not DESCRIBE a shared reservation. + // Case A: Standard API Access Check if msg, isSoft := getSoftWarningMessage(err, "test_reservation_exists", reservationProjectID, "Compute Engine API", "compute.reservations.get"); isSoft { fmt.Println(msg) - return nil // Skip and continue + return nil + } + + // Case B: Future API Access Check + if msg, isSoft := getSoftWarningMessage(fErr, "test_reservation_exists", reservationProjectID, "Compute Engine API", "compute.futureReservations.get"); isSoft { + fmt.Println(msg) + return nil } - // 3. Diagnostic Search: The reservation was not in the expected zone (404). + // 4. Diagnostic Search: The reservation was not in the expected zone (404). // We try to find where it actually is. - foundInZones, aggErr := findReservationInOtherZones(s, reservationProjectID, reservationName) + foundInZones, aggErr := findReservationInOtherZones(ctx, s, reservationProjectID, reservationName) if aggErr != nil { // If Discovery fails (403/400) and it's a SHARED project, we must skip @@ -320,7 +383,7 @@ func TestReservationExists(ctx context.Context, reservationProjectID string, zon return fmt.Errorf("reservation %q not found in project %q and zone %q", reservationName, reservationProjectID, zone) } - // 4. Resource Found Discovery: If we found it elsewhere, provide a Hard Failure with Hint. + // 5. Resource Found Discovery: Provide Hint if len(foundInZones) > 0 { zonesList := strings.Join(foundInZones, ", ") return config.HintError{ @@ -331,7 +394,7 @@ func TestReservationExists(ctx context.Context, reservationProjectID string, zon } } - // 5. Not Found Anywhere: Hard Failure + // 6. 
 Not Found Anywhere: Hard Failure + return fmt.Errorf("reservation %q was not found in any zone of project %q", reservationName, reservationProjectID) } @@ -368,7 +431,7 @@ func testReservationExists(bp config.Blueprint, inputs config.Dict) error { targetName = matches[2] } - // Pass both the owner project and the deployment project + // NOTE(review): a fresh context.Background() is created below, so no caller cancellation/timeout is inherited; plumb a caller-supplied ctx here if that is required ctx := context.Background() return TestReservationExists(ctx, reservationProjectID, zone, targetName, deploymentProjectID) } diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml index d3f1176434..fb28ceb58b 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml @@ -123,7 +123,7 @@ register: gcluster_destroy changed_when: gcluster_destroy.changed ignore_errors: true - ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve --robust args: chdir: "{{ workspace }}" environment: @@ -184,7 +184,7 @@ register: gcluster_destroy changed_when: gcluster_destroy.changed ignore_errors: true - ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve --robust args: chdir: "{{ workspace }}" environment: diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml index e5e0127dd4..30923c1fd1 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml @@ -38,7 +38,7 @@ register: gcluster_destroy 
changed_when: gcluster_destroy.changed ignore_errors: true - ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve --robust args: chdir: "{{ workspace }}" environment: diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml index 35ba4e28e1..a68fef2811 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml @@ -37,7 +37,7 @@ register: gcluster_destroy changed_when: gcluster_destroy.changed run_once: true - ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve --robust # Temporarily add retry due to K8S deletion issue. until: gcluster_destroy.rc == 0 retries: 1 diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/fio-test-job.yaml.j2 b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/fio-test-job.yaml.j2 new file mode 100644 index 0000000000..a6e7503e5c --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/fio-test-job.yaml.j2 @@ -0,0 +1,52 @@ +{# +Copyright 2026 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+#} + +apiVersion: v1 +kind: Pod +metadata: + name: fio + namespace: default + annotations: + gke-gcsfuse/volumes: "true" +spec: + containers: + - name: fio + image: mayadata/fio + command: ["/bin/ash", "-c", "--"] + args: + - | + fio --name=read_latency_test --filename=/data/fio --filesize=1G --time_based=1 --ramp_time=10s --runtime=1m --ioengine=libaio --direct=1 --verify=0 --randrepeat=0 --bs=4K --iodepth=1 --rw=randread --disable_slat=1 --disable_clat=1 --lat_percentiles=1 --numjobs=1 --output-format=json + volumeMounts: + - name: fio-bucket + mountPath: /data + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - "{{ cache_zone }}" + serviceAccountName: "{{ k8s_service_account_name }}" + volumes: + - name: fio-bucket + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ bucket_name }}" + gcsfuseLoggingSeverity: warning + restartPolicy: Never diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-slurm-rapid-storage.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-slurm-rapid-storage.yml new file mode 100644 index 0000000000..c8dbf93e94 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-slurm-rapid-storage.yml @@ -0,0 +1,96 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +- name: Get outputs from primary group + delegate_to: localhost + ansible.builtin.command: + cmd: terraform output -json + chdir: "{{ workspace }}/{{ deployment_name }}/primary" + register: outputs + changed_when: false + +- name: Set necessary facts + delegate_to: localhost + ansible.builtin.set_fact: + bucket_name: "{{ (outputs.stdout | from_json)['gcs_bucket_name_zonal-gcs-bucket']['value'] }}" + cache_zone: "{{ (lookup('file', blueprint_yaml) | from_yaml)['vars']['zone'] }}" + gcs_bucket_local_mount: "{{ (lookup('file', blueprint_yaml) | from_yaml)['vars']['gcs_bucket_local_mount'] }}" + +- name: Get bucket metadata + delegate_to: localhost + ansible.builtin.command: gcloud storage buckets describe gs://{{ bucket_name }} --format=json + register: bucket_metadata + changed_when: false + +- name: Assert storage class + delegate_to: localhost + ansible.builtin.assert: + that: + - (bucket_metadata.stdout | from_json)['default_storage_class'] == 'RAPID' + +- name: Assert placement zone + delegate_to: localhost + ansible.builtin.assert: + that: + - ((bucket_metadata.stdout | from_json)['data_locations'][0] | lower) == (cache_zone | lower) + +- name: Wait for Anywhere Cache creation + delegate_to: localhost + ansible.builtin.shell: | + gcloud alpha storage operations list gs://{{ bucket_name }} --format=json | jq -e '.[] | select(.metadata.commonMetadata.type == "create-anywhere-cache" and .done == true and (has("error") | not))' + register: cache_operation + until: cache_operation.rc == 0 + retries: 6 + delay: 10 + changed_when: false + +- name: Install FIO + ansible.builtin.package: + name: fio + state: present + become: true + +- name: Run FIO test + ansible.builtin.shell: | + fio --name=randread_latency_test --filename="{{ gcs_bucket_local_mount }}/fio" --filesize=1G --time_based=1 --ramp_time=10s --runtime=1m --ioengine=libaio --direct=1 --verify=0 --randrepeat=0 --bs=4K --iodepth=1 --rw=randread --disable_slat=1 --disable_clat=1 --lat_percentiles=1 
--numjobs=1 --group_reporting --output-format=json > fio_output.json + args: + chdir: "{{ gcs_bucket_local_mount }}" + become: true + +- name: Fetch FIO output + ansible.builtin.fetch: + src: "{{ gcs_bucket_local_mount }}/fio_output.json" + dest: "{{ workspace }}/fio_output.json" + flat: yes + become: true + +- name: Read FIO results from fetched file + delegate_to: localhost + ansible.builtin.set_fact: + fio_results: "{{ lookup('file', workspace + '/fio_output.json') | from_json }}" + +- name: Print FIO results + delegate_to: localhost + ansible.builtin.debug: + var: fio_results + +- name: Assert FIO read performance thresholds + delegate_to: localhost + ansible.builtin.assert: + that: + - fio_results.jobs[0].read.bw_mean >= 1000 + - fio_results.jobs[0].read.iops_mean >= 300 + fail_msg: "FIO read performance thresholds not met. Bandwidth: {{ fio_results.jobs[0].read.bw_mean }} KB/s (expected >= 1000 KB/s), IOPS: {{ fio_results.jobs[0].read.iops_mean }} (expected >= 300)." + success_msg: "FIO read performance thresholds met. Bandwidth: {{ fio_results.jobs[0].read.bw_mean }} KB/s, IOPS: {{ fio_results.jobs[0].read.iops_mean }}." diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-zonal-bucket.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-zonal-bucket.yml new file mode 100644 index 0000000000..c40f88dea1 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-zonal-bucket.yml @@ -0,0 +1,92 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +- name: Get outputs from primary group + delegate_to: localhost + ansible.builtin.command: + cmd: terraform output -json + chdir: "{{ workspace }}/{{ deployment_name }}/primary" + register: outputs + changed_when: false + +- name: Set necessary facts + delegate_to: localhost + ansible.builtin.set_fact: + bucket_name: "{{ (outputs.stdout | from_json)['gcs_bucket_name_data-bucket-zonal']['value'] }}" + cache_zone: "{{ (lookup('file', blueprint_yaml) | from_yaml)['vars']['zone'] }}" + k8s_service_account_name: "{{ (outputs.stdout | from_json)['k8s_service_account_name_gke_cluster']['value'] }}" + +- name: Get bucket metadata + delegate_to: localhost + ansible.builtin.command: gcloud storage buckets describe gs://{{ bucket_name }} --format=json + register: bucket_metadata + changed_when: false + +- name: Assert storage class + delegate_to: localhost + ansible.builtin.assert: + that: + - (bucket_metadata.stdout | from_json)['default_storage_class'] == 'RAPID' + +- name: Render FIO Kubernetes Pod manifest + ansible.builtin.template: + src: fio-test-job.yaml.j2 + dest: "{{ workspace }}/fio-test-job.yaml" + delegate_to: localhost + +- name: Get cluster credentials for kubectl + delegate_to: localhost + ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ region }} --project {{ custom_vars.project }} + +- name: Deploy FIO Kubernetes Pod using kubectl apply + ansible.builtin.command: kubectl apply -f {{ workspace }}/fio-test-job.yaml + delegate_to: localhost + +- name: Wait for FIO Pod to complete + ansible.builtin.command: kubectl wait --for=condition=Succeeded pod/fio --namespace default --timeout=900s + delegate_to: localhost + ignore_errors: true # Even if we timeout, continue to next steps + +- name: Fetch FIO logs + ansible.builtin.command: kubectl logs pod/fio --namespace default + register: fio_pod_logs + 
delegate_to: localhost + +- name: Delete FIO Pod + ansible.builtin.command: kubectl delete pod/fio --namespace default + delegate_to: localhost + ignore_errors: true + +- name: Parse FIO results + delegate_to: localhost + ansible.builtin.set_fact: + fio_results: "{{ fio_pod_logs.stdout | from_json }}" + +- name: Print FIO results + delegate_to: localhost + ansible.builtin.debug: + msg: + - "fio_results.jobs[0].read.bw_mean: {{ fio_results.jobs[0].read.bw_mean }}" + - "Type of bw: {{ fio_results.jobs[0].read.bw_mean | type_debug }}" + - "fio_results.jobs[0].read.iops_mean: {{ fio_results.jobs[0].read.iops_mean }}" + - "Type of iops: {{ fio_results.jobs[0].read.iops_mean | type_debug }}" + +- name: Assert FIO results + delegate_to: localhost + ansible.builtin.assert: + that: + - fio_results.jobs[0].read.bw_mean >= 1000 + - fio_results.jobs[0].read.iops_mean >= 300 + fail_msg: "FIO read performance thresholds not met. Bandwidth: {{ fio_results.jobs[0].read.bw_mean }} KB/s (expected >= 1000 KB/s), IOPS: {{ fio_results.jobs[0].read.iops_mean }} (expected >= 300)." + success_msg: "FIO read performance thresholds met. Bandwidth: {{ fio_results.jobs[0].read.bw_mean }} KB/s, IOPS: {{ fio_results.jobs[0].read.iops_mean }}." 
diff --git a/tools/cloud-build/daily-tests/builds/e2e.yaml b/tools/cloud-build/daily-tests/builds/e2e.yaml index 6d822fb7ef..376322d35f 100644 --- a/tools/cloud-build/daily-tests/builds/e2e.yaml +++ b/tools/cloud-build/daily-tests/builds/e2e.yaml @@ -48,4 +48,4 @@ steps: # check instance was created gcloud compute instances describe "${depl_name}-0" --project="$PROJECT_ID" --zone="$zone" >/dev/null - ./gcluster destroy "$depl_name" --auto-approve + ./gcluster destroy "$depl_name" --auto-approve --robust diff --git a/tools/cloud-build/daily-tests/builds/gcluster-dockerfile.yaml b/tools/cloud-build/daily-tests/builds/gcluster-dockerfile.yaml index c6d6c4da21..5ecbe717d1 100644 --- a/tools/cloud-build/daily-tests/builds/gcluster-dockerfile.yaml +++ b/tools/cloud-build/daily-tests/builds/gcluster-dockerfile.yaml @@ -68,4 +68,4 @@ steps: - id: destroy-resources name: 'gcluster' - args: ['destroy', '/workspace/dockerfile-test', '--auto-approve'] + args: ['destroy', '/workspace/dockerfile-test', '--auto-approve', '--robust'] diff --git a/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue.yaml b/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue.yaml deleted file mode 100644 index 41595f04ba..0000000000 --- a/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue.yaml +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -tags: -- m.gke-cluster -- m.gke-node-pool -- m.service-account -- m.vpc -- m.multivpc -- m.kubectl-apply -- gke - -substitutions: - _TEST_PREFIX: "" # Default to no prefix - -timeout: 14400s # 4hr -steps: -# While using static network names we are guarding against more than 1 instance running at a time (for multi-group tests) -- id: check_for_running_build - name: gcr.io/cloud-builders/gcloud - script: "tools/cloud-build/check_running_build.sh tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue.yaml" - -- id: gke-a2-highgpu-kueue-test - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace - if [ "${_TEST_PREFIX}" == "daily-" ]; then - gsutil cp gs://$${GCLUSTER_GCS_PATH}/latest/gcluster-bundle.zip . - unzip -o gcluster-bundle.zip - # Grant execution permissions to the binary - chmod +x gcluster - else - make - fi - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - EXAMPLE_BP=tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml - - echo ' - id: remote-node' >> $${EXAMPLE_BP} - echo ' source: modules/compute/vm-instance' >> $${EXAMPLE_BP} - echo ' use: [network1]' >> $${EXAMPLE_BP} - echo ' settings:' >> $${EXAMPLE_BP} - echo ' machine_type: e2-standard-2' >> $${EXAMPLE_BP} - echo ' name_prefix: remote-node' >> $${EXAMPLE_BP} - echo ' add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP} - - bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a2-highgpu-kueue.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] -availableSecrets: - secretManager: - - versionName: 
projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest - env: 'GCLUSTER_GCS_PATH' diff --git a/tools/cloud-build/daily-tests/builds/gke-a4.yaml b/tools/cloud-build/daily-tests/builds/gke-a4.yaml deleted file mode 100644 index 3a9b858899..0000000000 --- a/tools/cloud-build/daily-tests/builds/gke-a4.yaml +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.gke-job-template -- gke -- m.gke-cluster -- m.gke-node-pool -- m.service-account -- m.gpu-rdma-vpc -- m.kubectl-apply -- m.vpc -- m.cloud-storage-bucket -- m.gke-persistent-volume -- m.pre-existing-network-storage - -substitutions: - _TEST_PREFIX: "" # Default to no prefix - -timeout: 14400s # 4hr -steps: -# While using static network names we are guarding against more than 1 instance running at a time (for multi-group tests) -- id: check_for_running_build - name: gcr.io/cloud-builders/gcloud - script: "tools/cloud-build/check_running_build.sh tools/cloud-build/daily-tests/builds/gke-a4.yaml" - -- id: gke-a4 - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace - if [ "${_TEST_PREFIX}" == "daily-" ]; then - gsutil cp gs://$${GCLUSTER_GCS_PATH}/latest/gcluster-bundle.zip . 
- unzip -o gcluster-bundle.zip - # Grant execution permissions to the binary - chmod +x gcluster - else - make - fi - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - EXAMPLE_BP=examples/gke-a4/gke-a4.yaml - - - # adding vm to act as remote node - echo ' - id: remote-node' >> $${EXAMPLE_BP} - echo ' source: modules/compute/vm-instance' >> $${EXAMPLE_BP} - echo ' use: [gke-a4-net-0]' >> $${EXAMPLE_BP} - echo ' settings:' >> $${EXAMPLE_BP} - echo ' machine_type: e2-standard-2' >> $${EXAMPLE_BP} - echo ' name_prefix: remote-node' >> $${EXAMPLE_BP} - echo ' add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP} - echo '' - echo ' - id: job_template_hostname' >> $${EXAMPLE_BP} - echo ' source: modules/compute/gke-job-template' >> $${EXAMPLE_BP} - echo ' use: [a4-pool]' >> $${EXAMPLE_BP} - echo ' settings:' >> $${EXAMPLE_BP} - echo ' image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP} - echo ' command:' >> $${EXAMPLE_BP} - echo ' - nvidia-smi' >> $${EXAMPLE_BP} - echo ' node_count: 1' >> $${EXAMPLE_BP} - echo ' outputs: [instructions]' >> $${EXAMPLE_BP} - - bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a4.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] -availableSecrets: - secretManager: - - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest - env: 'GCLUSTER_GCS_PATH' diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index c5928b9892..05bb63d03b 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -15,6 +15,8 @@ --- tags: - m.cloud-storage-bucket +- gcs-rapid +- anywhere-cache - m.filestore - m.gke-cluster - 
m.service-account @@ -64,7 +66,7 @@ steps: echo ' use: [network1]' >> $${SG_EXAMPLE} echo ' settings:' >> $${SG_EXAMPLE} echo ' machine_type: e2-standard-2' >> $${SG_EXAMPLE} - echo ' zone: us-central1-a' >> $${SG_EXAMPLE} + echo ' zone: us-central1-b' >> $${SG_EXAMPLE} bash tools/add_ttl_label.sh "$${SG_EXAMPLE}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/h4d-vm.yaml b/tools/cloud-build/daily-tests/builds/h4d-vm.yaml index c0f36ac76f..41adb41ff9 100644 --- a/tools/cloud-build/daily-tests/builds/h4d-vm.yaml +++ b/tools/cloud-build/daily-tests/builds/h4d-vm.yaml @@ -83,6 +83,7 @@ steps: echo "INFO: Using $${H4D_VARS_FILE} as it is for SPOT provisioning." fi bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: cluster-net-0/,/- id: cluster-rdma-net-0/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml index 95ff82d628..f9c3e90b86 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml @@ -86,6 +86,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: sysnet/,/- id: gpunets/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} "\ diff --git 
a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml index b903a3058d..74e917d617 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml @@ -64,6 +64,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: sysnet/,/- id: gpunets/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} "\ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml index d94afd0891..cd83af290f 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml @@ -26,6 +26,7 @@ tags: - m.vpc - slurm6 + timeout: 14400s # 4hr steps: # While using static network names we are guarding against more than 1 instance running at a time (for multi-group tests) @@ -76,6 +77,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: a3ultra-slurm-net-0/,/- id: a3ultra-slurm-net-1/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml index 977b2890b5..678b9eef7a 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml @@ -58,6 +58,8 @@ steps: 
BLUEPRINT="/workspace/examples/machine-learning/a3-ultragpu-8g/a3ultra-vm.yaml" sed -i -e '/deletion_protection:/{n;s/enabled: true/enabled: false/}' $${BLUEPRINT} sed -i -e '/reason:/d' $${BLUEPRINT} + bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: a3ultra-net-0/,/- id: a3ultra-net-1/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml index 449b90e8f7..7fb13d7b4c 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml @@ -84,6 +84,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: a3ultra-net-0/,/- id: a3ultra-net-1/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml index ca6083424b..da1706136e 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml @@ -89,7 +89,8 @@ steps: else echo "INFO: Using $${SLURM_VARS_FILE} as it is for SPOT provisioning." 
fi - + bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: a3ultra-slurm-net-0/,/- id: a3ultra-slurm-net-1/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml index 5193d65f33..10d3ca3118 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml @@ -63,6 +63,8 @@ steps: BLUEPRINT="/workspace/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml" sed -i -e '/deletion_protection:/{n;s/enabled: true/enabled: false/}' $${BLUEPRINT} sed -i -e '/reason:/d' $${BLUEPRINT} + bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: a3ultra-slurm-net-0/,/- id: a3ultra-slurm-net-1/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml index cbe48a5896..87397aa5b3 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml @@ -26,6 +26,7 @@ tags: - m.vpc - slurm6 + timeout: 14400s # 4hr steps: # While using static network names we are guarding against more than 1 instance running at a time (for multi-group tests) @@ -63,6 +64,7 @@ steps: 
BLUEPRINT="/workspace/tools/cloud-build/daily-tests/blueprints/a4high-custom-image-blueprint.yaml" sed -i -e '/deletion_protection:/{n;s/enabled: true/enabled: false/}' $${BLUEPRINT} sed -i -e '/reason:/d' $${BLUEPRINT} + bash tools/add_ttl_label.sh $${BLUEPRINT} ENABLE_SPOT="true" VARS_FILE="tools/cloud-build/daily-tests/tests/ml-a4-highgpu-custom-blueprint-test.yml" @@ -74,6 +76,7 @@ steps: echo "INFO: Using $${VARS_FILE} as it is for SPOT provisioning." fi + sed -i -e '/- id: a4high-slurm-net-0/,/- id: a4high-slurm-net-1/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml index 115ccbe0f5..807a6d9dc9 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml @@ -89,6 +89,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: a4high-slurm-net-0/,/- id: a4high-slurm-net-1/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml index 588073e6c0..6036182df0 100644 --- a/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml @@ -81,6 +81,8 @@ steps: echo "INFO: Using $${SLURM_VARS_FILE} as it is for SPOT provisioning." 
fi + bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: net0/,/- id: homefs/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml index b5f76138ec..778ffd8c66 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml @@ -70,7 +70,7 @@ steps: IP=$(curl ifconfig.me) sed -i "s//$${IP}/" $${SG_EXAMPLE} - + bash tools/add_ttl_label.sh $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ --extra-vars="@tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml" diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index d853ff996f..4afb0edb8d 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -70,6 +70,7 @@ steps: IP=$(curl ifconfig.me) sed -i "s//$${IP}/" $${SG_EXAMPLE} + bash tools/add_ttl_label.sh $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml index 7cd3937a06..f3f010e2e7 100644 --- a/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml @@ -80,6 +80,8 @@ steps: echo "INFO: Using $${SLURM_VARS_FILE} as it is for SPOT provisioning." 
fi + bash tools/add_ttl_label.sh $${BLUEPRINT} + sed -i -e '/- id: h4d-slurm-net-0/,/- id: h4d-rdma-net/ s/network_name: .*/network_name: $(vars.base_network_name)/' $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml index 4a97d3371c..5bc6bdcec3 100644 --- a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml @@ -58,6 +58,8 @@ steps: fi BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + BLUEPRINT="examples/ml-slurm.yaml" + bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml similarity index 62% rename from tools/cloud-build/daily-tests/builds/ml-a4-highgpu-slurm.yaml rename to tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml index 1fbd4d04dc..bacbc45c8a 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml @@ -14,30 +14,24 @@ --- tags: -- m.custom-image +- gcs-rapid +- anywhere-cache +- m.vpc - m.cloud-storage-bucket -- m.pre-existing-network-storage -- m.filestore -- m.gpu-rdma-vpc - m.schedmd-slurm-gcp-v6-controller - m.schedmd-slurm-gcp-v6-login - m.schedmd-slurm-gcp-v6-nodeset - m.schedmd-slurm-gcp-v6-partition -- m.startup-script -- m.vpc -- slurm6 - -substitutions: - _TEST_PREFIX: "" # Default to no prefix timeout: 14400s # 4hr steps: # While using static network names we are guarding against more than 1 
instance running at a time (for multi-group tests) - id: check_for_running_build name: gcr.io/cloud-builders/gcloud - script: "tools/cloud-build/check_running_build.sh tools/cloud-build/daily-tests/builds/ml-a4-highgpu-slurm.yaml" + script: "tools/cloud-build/check_running_build.sh tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml" -- id: ml-a4-highgpu-slurm +## Test SLURM Rapid Storage +- id: slurm-rapid-storage name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: @@ -58,19 +52,8 @@ steps: fi BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - REGION=europe-west1 - ZONE=europe-west1-b - BLUEPRINT="/workspace/examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml" - sed -i -e '/deletion_protection:/{n;s/enabled: true/enabled: false/}' $${BLUEPRINT} - sed -i -e 's/\breservation_name\b/future_reservation/g' $${BLUEPRINT} - sed -i -e '/reason:/d' $${BLUEPRINT} + BLUEPRINT=examples/rapid-storage-slurm.yaml + bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="region=$${REGION} zone=$${ZONE}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a4-highgpu-slurm.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] -availableSecrets: - secretManager: - - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest - env: 'GCLUSTER_GCS_PATH' + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-rapid-storage.yaml" diff --git a/tools/cloud-build/daily-tests/tests/gke-a2-highgpu-kueue.yml b/tools/cloud-build/daily-tests/tests/gke-a2-highgpu-kueue.yml deleted file mode 100644 index 5520c5ffea..0000000000 --- a/tools/cloud-build/daily-tests/tests/gke-a2-highgpu-kueue.yml 
+++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -# region, zone must be defined -# in build file with --extra-vars flag! -test_name: gke-a2high-kueue -deployment_name: gke-a2high-{{ build }} -workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml" -network: "gke-a2high-net-{{ build }}" -region: us-central1 -zone: us-central1-f -remote_node: "{{ deployment_name }}-remote-node-0" -reservation_affinity: - consume_reservation_type: SPECIFIC_RESERVATION - specific_reservations: - - name: a2-reservation-0 - project: "{{ project }}" -cli_deployment_vars: - region: "{{ region }}" - zone: "{{ zone }}" - network_name: "{{ network }}" - reservation_affinity: "{{ reservation_affinity }}" - local_ssd_count_nvme_block: 2 -custom_vars: - project: "{{ project }}" -post_deploy_tests: -- test-validation/test-gke-kueue.yml diff --git a/tools/cloud-build/daily-tests/tests/gke-a4.yml b/tools/cloud-build/daily-tests/tests/gke-a4.yml deleted file mode 100644 index 012e99c983..0000000000 --- a/tools/cloud-build/daily-tests/tests/gke-a4.yml +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -# region, zone must be defined -# in build file with --extra-vars flag! -test_name: gke-a4 -deployment_name: gke-a4-{{ build }} -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/gke-a4/gke-a4.yaml" -network: "{{ deployment_name }}-net-0" -region: us-central1 -zone: us-central1-b -remote_node: "{{ deployment_name }}-remote-node-0" -extended_reservation: nvidia-b200-412c63fb-4e68-46d2-a19e-52b7e468ca7b -static_node_count: 2 -instance_type: a4 -accelerator_type: nvidia-b200 -num_gpus: 16 -k8s_service_account_name: workload-identity-k8s-sa -cli_deployment_vars: - region: "{{ region }}" - zone: "{{ zone }}" - static_node_count: "{{ static_node_count }}" - reservation: "{{ extended_reservation }}" - authorized_cidr: "{{ build_ip.stdout }}/32" - gcp_public_cidrs_access_enabled: true - k8s_service_account_name: "{{ k8s_service_account_name}}" -custom_vars: - project: "{{ project }}" -post_deploy_tests: -- test-validation/test-gke-job.yml -- test-validation/test-gke-a4.yml -- test-validation/test-gke-kueue-config.yml diff --git a/tools/cloud-build/daily-tests/tests/gke-storage.yml b/tools/cloud-build/daily-tests/tests/gke-storage.yml index a605a29861..d774ca14ee 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage.yml +++ b/tools/cloud-build/daily-tests/tests/gke-storage.yml @@ -15,15 +15,17 @@ test_name: storage-gke deployment_name: gke-storage-{{ build }} region: us-central1 -zone: us-central1-a # for remote node +zone: us-central1-b # for remote node workspace: /workspace blueprint_yaml: "{{ workspace 
}}/examples/storage-gke.yaml" network: "{{ test_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: +- test-validation/test-zonal-bucket.yml - test-validation/test-anywhere-cache.yml cli_deployment_vars: region: "{{ region }}" + zone: "{{ zone }}" network_name: "{{ network }}" authorized_cidr: "{{ build_ip.stdout }}/32" gcp_public_cidrs_access_enabled: true diff --git a/tools/cloud-build/daily-tests/tests/h4d-vm.yml b/tools/cloud-build/daily-tests/tests/h4d-vm.yml index 9e7c51ac61..b31b181e28 100644 --- a/tools/cloud-build/daily-tests/tests/h4d-vm.yml +++ b/tools/cloud-build/daily-tests/tests/h4d-vm.yml @@ -19,7 +19,7 @@ test_name: h4d-jbvms deployment_name: h4d-jbvms-{{ build }} workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/h4d-vm.yaml" -network: "{{ test_name }}-net" +network: "{{ test_name }}" remote_node: "{{ deployment_name }}-0" post_deploy_tests: - test-validation/test-irdma.yml @@ -31,6 +31,6 @@ custom_vars: h4d_onspot: true enable_spot: true cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" - base_network_name: "{{ test_name }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-onspot-slurm.yml b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-onspot-slurm.yml index 3a8dfc1c12..dacac701e3 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-onspot-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-onspot-slurm.yml @@ -23,7 +23,7 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-highgpu-8g/a3high-slurm-blueprint.yaml" login_node: "{{ slurm_cluster_name }}-login-*" controller_node: "{{ slurm_cluster_name }}-controller" -network: "{{ deployment_name }}-net-0" +network: "{{ test_name }}" nccl_test_path: "examples/machine-learning/a3-highgpu-8g/nccl-tests" sub_network: "{{ deployment_name }}-sub-0" post_deploy_tests: @@ -46,6 +46,7 @@ custom_vars: a3high_onspot: true enable_spot: true 
cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" tcpx_kernel_login: "{{ tcpx_kernel_login }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm.yml b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm.yml index 5aea242458..cf5305ba42 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm.yml @@ -23,7 +23,7 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-highgpu-8g/a3high-slurm-blueprint.yaml" login_node: "{{ slurm_cluster_name }}-login-*" controller_node: "{{ slurm_cluster_name }}-controller" -network: "{{ deployment_name }}-net-0" +network: "{{ test_name }}" nccl_test_path: "examples/machine-learning/a3-highgpu-8g/nccl-tests" sub_network: "{{ deployment_name }}-sub-0" post_deploy_tests: @@ -43,6 +43,7 @@ custom_vars: mounts: - /home cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" tcpx_kernel_login: "{{ tcpx_kernel_login }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-custom-blueprint-test.yml b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-custom-blueprint-test.yml index 4974ab355c..d08c9de2c8 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-custom-blueprint-test.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-custom-blueprint-test.yml @@ -22,7 +22,7 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/a3ultra-custom-image-blueprint.yaml" login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" -network: "{{ test_name }}-net-0" +network: "{{ test_name }}" nccl_test_path: "examples/machine-learning/a3-ultragpu-8g/nccl-tests" post_deploy_tests: - test-validation/test-mounts.yml @@ -46,6 +46,7 @@ custom_vars: a3ultra_onspot: true enable_spot: true cli_deployment_vars: + 
base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" slurm_cluster_name: "{{ slurm_cluster_name }}" @@ -53,5 +54,4 @@ cli_deployment_vars: a3u_cluster_size: 2 instance_image_project: "{{ instance_image_project }}" instance_image_family: "{{ instance_image_family }}" - base_network_name: "{{ test_name }}" a3u_enable_spot_vm: true diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-jbvms.yml b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-jbvms.yml index 4168079c16..be70e6b4a8 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-jbvms.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-jbvms.yml @@ -22,7 +22,7 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-ultragpu-8g/a3ultra-vm.yaml" region: europe-west1 zone: europe-west1-b -network: "{{ test_name }}-net-0" +network: "{{ test_name }}" remote_node: "{{ hostname_prefix }}-0" post_deploy_tests: - test-validation/test-mounts.yml @@ -31,8 +31,8 @@ custom_vars: mounts: - /home cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" a3u_reservation_name: hpc-exr-2 a3u_provisioning_model: RESERVATION_BOUND - base_network_name: "{{ test_name }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-onspot-jbvms.yml b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-onspot-jbvms.yml index 4efb37aada..bcf8cc3e13 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-onspot-jbvms.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-onspot-jbvms.yml @@ -20,7 +20,7 @@ deployment_name: a3u-spot-jbvms-{{ build }} hostname_prefix: "{{ deployment_name }}-beowulf" workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-ultragpu-8g/a3ultra-vm.yaml" -network: "{{ test_name }}-net-0" +network: "{{ test_name }}" remote_node: "{{ hostname_prefix }}-0" post_deploy_tests: - test-validation/test-mounts.yml @@ -32,7 +32,7 @@ custom_vars: 
a3ultra_onspot: true enable_spot: true cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" a3u_provisioning_model: SPOT - base_network_name: "{{ test_name }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-onspot-slurm.yml b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-onspot-slurm.yml index 32b9125e56..a83fcf9d3e 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-onspot-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-onspot-slurm.yml @@ -22,7 +22,7 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml" login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" -network: "{{ test_name }}-net-0" +network: "{{ test_name }}" nccl_test_path: "examples/machine-learning/a3-ultragpu-8g/nccl-tests" post_deploy_tests: - test-validation/test-mounts.yml @@ -46,10 +46,10 @@ custom_vars: a3ultra_onspot: true enable_spot: true cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" slurm_cluster_name: "{{ slurm_cluster_name }}" disk_size_gb: 100 a3u_cluster_size: 2 - base_network_name: "{{ test_name }}" a3u_enable_spot_vm: true diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-slurm.yml b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-slurm.yml index 039494ae87..f5943730fd 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-slurm.yml @@ -24,7 +24,7 @@ login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" region: europe-west1 zone: europe-west1-b -network: "{{ test_name }}-net-0" +network: "{{ test_name }}" post_deploy_tests: - test-validation/test-mounts.yml - test-validation/test-partitions.yml @@ -43,10 +43,10 @@ custom_vars: - /home - /gcs 
cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" slurm_cluster_name: "{{ slurm_cluster_name }}" disk_size_gb: 100 a3u_cluster_size: 2 a3u_reservation_name: hpc-exr-2 - base_network_name: "{{ test_name }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-custom-blueprint-test.yml b/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-custom-blueprint-test.yml index 33179b8f16..4edd939e8f 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-custom-blueprint-test.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-custom-blueprint-test.yml @@ -22,7 +22,7 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/a4high-custom-image-blueprint.yaml" login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" -network: "{{ test_name }}-net-0" +network: "{{ test_name }}" post_deploy_tests: - test-validation/test-mounts.yml - test-validation/test-partitions.yml @@ -44,11 +44,11 @@ custom_vars: a4high_onspot: true enable_spot: true cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" slurm_cluster_name: "{{ slurm_cluster_name }}" disk_size_gb: 100 - base_network_name: "{{ test_name }}" a4h_enable_spot_vm: true a4h_cluster_size: 2 diff --git a/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-onspot-slurm.yml b/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-onspot-slurm.yml index cfab6e421f..447de292b2 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-onspot-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-onspot-slurm.yml @@ -22,7 +22,7 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml" login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" -network: "{{ test_name }}-net-0" +network: "{{ test_name 
}}" post_deploy_tests: - test-validation/test-mounts.yml - test-validation/test-partitions.yml @@ -44,10 +44,10 @@ custom_vars: a4high_onspot: true enable_spot: true cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" slurm_cluster_name: "{{ slurm_cluster_name }}" disk_size_gb: 100 a4h_cluster_size: 2 - base_network_name: "{{ test_name }}" a4h_enable_spot_vm: true diff --git a/tools/cloud-build/daily-tests/tests/ml-g4-onspot-slurm.yml b/tools/cloud-build/daily-tests/tests/ml-g4-onspot-slurm.yml index 48ebe17fd6..9feb8e7ed3 100644 --- a/tools/cloud-build/daily-tests/tests/ml-g4-onspot-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ml-g4-onspot-slurm.yml @@ -22,7 +22,7 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/ml-slurm-g4.yaml" login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" -network: "{{ deployment_name }}-net-0" +network: "{{ test_name }}" post_deploy_tests: - test-validation/test-mounts.yml - test-validation/test-partitions.yml @@ -41,6 +41,7 @@ custom_vars: g4_onspot: true enable_spot: true cli_deployment_vars: + base_network_name: '{{ test_name }}' region: "{{ region }}" zone: "{{ zone }}" slurm_cluster_name: "{{ slurm_cluster_name }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-h4d-onspot-slurm.yml b/tools/cloud-build/daily-tests/tests/ml-h4d-onspot-slurm.yml index 85e9ba1a7e..014624d36f 100644 --- a/tools/cloud-build/daily-tests/tests/ml-h4d-onspot-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ml-h4d-onspot-slurm.yml @@ -22,7 +22,7 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/hpc-slurm-h4d/hpc-slurm-h4d.yaml" login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" -network: "{{ deployment_name }}-net" +network: "{{ test_name }}" post_deploy_tests: - test-validation/test-mounts.yml - test-validation/test-partitions.yml @@ -39,6 +39,7 @@ 
custom_vars: h4d_onspot: true enable_spot: true cli_deployment_vars: + base_network_name: '{{ test_name }}' deployment_name: "{{ deployment_name }}" region: "{{ region }}" zone: "{{ zone }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-slurm.yml b/tools/cloud-build/daily-tests/tests/slurm-rapid-storage.yaml similarity index 53% rename from tools/cloud-build/daily-tests/tests/ml-a4-highgpu-slurm.yml rename to tools/cloud-build/daily-tests/tests/slurm-rapid-storage.yaml index 67deaa8910..dbbc0672bb 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a4-highgpu-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-rapid-storage.yaml @@ -11,42 +11,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - --- - -# region, zone must be defined in build file with --extra-vars flag! -test_name: a4h-slurm -deployment_name: a4h-slurm-{{ build }} -slurm_cluster_name: "a4h{{ build[0:4] }}" +test_name: slurm-rapid-storage +deployment_name: rapid-{{ build }} +slurm_cluster_name: "rapid{{ build[0:4] }}" +region: us-central1 +zone: us-central1-b workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml" +blueprint_yaml: "{{ workspace }}/examples/rapid-storage-slurm.yaml" +network: "{{ test_name }}-net" login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" -region: us-central1 -zone: us-central1-b -network: "{{ test_name }}-net-0" post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -- test-validation/test-default-partition.yml -- test-validation/test-enroot.yml -- test-validation/test-gpus-slurm.yml -post_destroy_tasks: -- post-destroy-tasks/delete-image.yml -custom_vars: - gpu_count: 8 - gpu_partition: a4high - test_persistenced: true - partitions: - - a4high - mounts: - - 
/home - - /gcs +- test-validation/test-slurm-rapid-storage.yml cli_deployment_vars: region: "{{ region }}" - zone: "{{ zone }}" + network_name: "{{ network }}" slurm_cluster_name: "{{ slurm_cluster_name }}" - disk_size_gb: 100 - a4h_cluster_size: 2 - a4h_reservation_name: nvidia-b200-db38b25a-c93d-4a7e-a3c5-ba4135be357e - base_network_name: "{{ test_name }}" +custom_vars: + project_id: "{{ project }}" + zone: "{{ zone }}" + output_dir: "{{ (lookup('file', blueprint_yaml) | from_yaml)['vars']['gcs_bucket_local_mount'] }}" + mounts: + - "{{ (lookup('file', blueprint_yaml) | from_yaml)['vars']['gcs_bucket_local_mount'] }}" + partitions: + - rapid diff --git a/tools/cloud-build/daily-tests/validate_tests_metadata.py b/tools/cloud-build/daily-tests/validate_tests_metadata.py index 258e7b5c9a..6eadde3cc7 100644 --- a/tools/cloud-build/daily-tests/validate_tests_metadata.py +++ b/tools/cloud-build/daily-tests/validate_tests_metadata.py @@ -30,6 +30,8 @@ "packer", "slurm5", "slurm6", + "gcs-rapid", + "anywhere-cache", "spack", "tpu", "vm", @@ -76,6 +78,7 @@ def get_blueprint(build_path: str) -> Optional[str]: f"{BUILDS_DIR}/slurm-gcp-v6-simple-job-completion.yaml": "tools/python-integration-tests/blueprints/slurm-simple.yaml", f"{BUILDS_DIR}/slurm-flex.yaml": "tools/python-integration-tests/blueprints/slurm-flex.yaml", f"{BUILDS_DIR}/slurm-gcp-v6-topology.yaml": "tools/python-integration-tests/blueprints/topology-test.yaml", + f"{BUILDS_DIR}/slurm-rapid-storage.yaml": "examples/rapid-storage-slurm.yaml", } if build_path in SPECIAL_CASES: return SPECIAL_CASES[build_path] diff --git a/tools/cloud-build/dependency-checks/Dockerfile.precommit b/tools/cloud-build/dependency-checks/Dockerfile.precommit index 016bde044a..224a5f6f7f 100644 --- a/tools/cloud-build/dependency-checks/Dockerfile.precommit +++ b/tools/cloud-build/dependency-checks/Dockerfile.precommit @@ -24,6 +24,6 @@ RUN go install github.com/terraform-docs/terraform-docs@latest && \ go install 
github.com/fzipp/gocyclo/cmd/gocyclo@latest && \ go install github.com/go-critic/go-critic/cmd/gocritic@latest && \ go install github.com/google/addlicense@latest && \ - go install mvdan.cc/sh/v3/cmd/shfmt@latest && \ - go install golang.org/x/tools/cmd/goimports@latest && \ + go install mvdan.cc/sh/v3/cmd/shfmt@v3.12.0 && \ + go install golang.org/x/tools/cmd/goimports@v0.42.0 && \ go install honnef.co/go/tools/cmd/staticcheck@v0.6.1 diff --git a/tools/cloud-build/provision/daily-cleanup.tf b/tools/cloud-build/provision/daily-cleanup.tf index f7eca28c22..26db27e54b 100644 --- a/tools/cloud-build/provision/daily-cleanup.tf +++ b/tools/cloud-build/provision/daily-cleanup.tf @@ -86,6 +86,6 @@ resource "google_cloudbuild_trigger" "daily_project_cleanup" { module "daily_project_cleanup_schedule" { source = "./trigger-schedule" trigger = google_cloudbuild_trigger.daily_project_cleanup - schedule = "30 23 * * *" + schedule = "30 3-23/4 * * *" retry_count = 4 } diff --git a/tools/publish_release.sh b/tools/publish_release.sh index c58646dd18..5a1981c281 100644 --- a/tools/publish_release.sh +++ b/tools/publish_release.sh @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# jq is required to generate JSON safely for the release manifest. +# Ensure jq is installed on your system before running this script. # bash tools/publish_release.sh # bash tools/publish_release.sh @@ -20,6 +22,12 @@ set -euo pipefail BUCKET_PATH="gs://oss-exit-gate-prod-projects-bucket/cluster-toolkit/githubreleases/manifests/rel.json" +# Check if jq is installed +if ! command -v jq >/dev/null 2>&1; then + echo "ERROR: jq is required but not installed." >&2 + exit 1 +fi + if [[ $# -eq 2 ]]; then RELEASE_NAME="$1" RELEASE_TAG="$2" @@ -53,17 +61,18 @@ echo "Release Tag : ${RELEASE_TAG}" TMP_FILE="$(mktemp)" trap 'rm -f "${TMP_FILE}"' EXIT -cat >"${TMP_FILE}" <"${TMP_FILE}" echo "Uploading release manifest to GCS..." 
@@ -72,4 +81,4 @@ if ! gcloud storage cp "${TMP_FILE}" "${BUCKET_PATH}"; then exit 1 fi -echo "Release publish process finished successfully." +echo "Release publish process finished successfully for tag ${RELEASE_TAG}" diff --git a/tools/update-dependencies.sh b/tools/update-dependencies.sh new file mode 100755 index 0000000000..fcf302d130 --- /dev/null +++ b/tools/update-dependencies.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright 2026 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# bash tools/update-dependencies.sh + +set -euo pipefail + +TERRAFORM_VERSION="1.12.2" +PACKER_VERSION="1.15.0" +OUTPUT_FILE="pkg/dependencies/checksums_generated.go" + +mkdir -p "$(dirname "$OUTPUT_FILE")" + +# Create or overwrite the generated file +cat <"$OUTPUT_FILE" +/** + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Code generated by $(basename "$0"). DO NOT EDIT. 
+package dependencies + +var ( + TerraformVersion = "${TERRAFORM_VERSION}" + PackerVersion = "${PACKER_VERSION}" + + ExpectedChecksums = map[string]string{ +EOF + +process_dependency() { + local name="$1" + local version="$2" + local output_file="$3" + + echo "Fetching ${name} ${version} checksums..." + curl -sSL "https://releases.hashicorp.com/${name}/${version}/${name}_${version}_SHA256SUMS" | while read -r sha256 filename; do + if [[ $filename =~ ${name}_${version}_([a-zA-Z0-9_]+)\.zip$ ]]; then + local os_arch="${BASH_REMATCH[1]}" + echo " \"${name}_${os_arch}\": \"${sha256}\"," >>"${output_file}" + fi + done +} + +process_dependency "terraform" "${TERRAFORM_VERSION}" "${OUTPUT_FILE}" +process_dependency "packer" "${PACKER_VERSION}" "${OUTPUT_FILE}" + +cat <<EOF >>"$OUTPUT_FILE" + } +) +EOF + +go fmt "$OUTPUT_FILE" + +echo "Done generating ${OUTPUT_FILE}" diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index 1ad4cb55af..b8ce939c46 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 22a1c2d79e..10a87606c2 100644 ---
a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index d9ef805958..2ca0f95a19 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -80,14 +80,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 22a1c2d79e..10a87606c2 100644 --- 
a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 22a1c2d79e..10a87606c2 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 0d914e6202..6cc2847e09 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git 
a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 22a1c2d79e..10a87606c2 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index 9ec6013499..9bcf14883f 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -48,14 +48,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 6.9.0, <= 7.21.0' + version: '>= 6.9.0, <= 7.23.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index 22a1c2d79e..10a87606c2 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ 
b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 6.9.0, <= 7.21.0" + version = ">= 6.9.0, <= 7.23.0" } } }