Skip to content

Commit b54535f

Browse files
committed
feat: google build is working
Signed-off-by: vsoch <[email protected]>
1 parent a902cc9 commit b54535f

File tree

20 files changed

+1336
-338
lines changed

20 files changed

+1336
-338
lines changed

tutorial/google/README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Flux on Google Cloud
2+
3+
This is a setup akin to Cluster Toolkit (using Terraform) to run Flux on Google Cloud.
4+
Google Cloud does not support any kind of special networking, so we will rely on ethernet. This setup also comes with Singularity and ORAS. You'll need to build the base image from [build-images](build-images). Since we can change the instance on the fly (generally speaking) we just have one build and terraform directory.
5+
6+
## Usage
7+
8+
### 1. Create Google Service Accounts
9+
10+
Create default application credentials (just once):
11+
12+
```bash
13+
gcloud auth application-default login
14+
```
15+
16+
### 2. Build Base Image
17+
18+
You can build the base VM with [build-images](build-images). This is working with packer (!) so you should look at the main packer HCL files, see if you want to customize anything, and then just:
19+
20+
```bash
21+
make
22+
```
23+
24+
The install script is in [build-images/build.sh](build-images/build.sh), and you can customize it as you like. Note that the Makefile has a setting so that when a command fails, it waits for your response. If this happens, I recommend that you shell into the VM to debug, and then run the build again once you've found and fixed the issue. If you don't change anything, it should work as is.
25+
26+
### 3. Terraform
27+
28+
Next, cd into [tf](tf) and again open [tf/basic.tfvars](tf/basic.tfvars) to look at the metadata and update anything as needed. I recommend starting at a small scale first. Then bring it up!
29+
30+
```bash
31+
make
32+
```
33+
34+
When you are done:
35+
36+
```bash
37+
make destroy
38+
```
39+
40+
Note that I had issues with a full Terraform teardown, so I wrote a script that asks for the number of instances and uses gcloud to clean up whatever Terraform leaves behind.

tutorial/google/base/main.tf

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Copyright 2022 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Instantiate the compute module once per entry in var.compute_node_specs.
# Instances are keyed by name_prefix, so each spec must use a unique prefix.
module "compute_nodes" {
  source = "./modules/compute"

  for_each = {
    for node in var.compute_node_specs :
    node.name_prefix => node
  }
  project_id = var.project_id
  region     = var.region

  family = var.family

  name_prefix   = each.value.name_prefix
  subnetwork    = var.subnetwork
  machine_arch  = each.value.machine_arch
  machine_type  = each.value.machine_type
  num_instances = each.value.instances

  boot_script       = lookup(each.value, "boot_script", null)
  compact_placement = lookup(each.value, "compact", false)

  # Attach GPUs only when both a type and a positive count are supplied.
  # coalesce() maps a null gpu_count to 0 so the comparison cannot fail on null.
  gpu = each.value.gpu_type == null || coalesce(each.value.gpu_count, 0) <= 0 ? null : {
    type  = each.value.gpu_type
    count = each.value.gpu_count
  }
  service_account = {
    email  = var.service_account_emails["compute"]
    scopes = var.compute_scopes
  }
  nfs_mounts = var.cluster_storage
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright 2022 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
data "google_compute_image" "fluxfw_compute_x86_64_image" {
16+
project = var.project_id
17+
family = var.family
18+
}
19+
20+
data "google_compute_zones" "available" {
21+
project = var.project_id
22+
region = var.region
23+
}
24+
25+
locals {
26+
automatic_restart = var.compact_placement ? false : var.automatic_restart
27+
compute_images = {
28+
"x86-64" = {
29+
image = data.google_compute_image.fluxfw_compute_x86_64_image.self_link
30+
project = data.google_compute_image.fluxfw_compute_x86_64_image.project
31+
}
32+
}
33+
on_host_maintenance = var.compact_placement ? "TERMINATE" : var.on_host_maintenance
34+
}
35+
36+
resource "google_compute_resource_policy" "collocated" {
37+
count = var.compact_placement ? 1 : 0
38+
name = "${var.name_prefix}-collocated-policy"
39+
project = var.project_id
40+
region = var.region
41+
group_placement_policy {
42+
vm_count = var.num_instances
43+
collocation = "COLLOCATED"
44+
}
45+
}
46+
47+
# Instance template for the compute nodes of this group. The boot image is
# selected from local.compute_images by var.machine_arch.
# NOTE(review): the module source is not pinned to a ref, so upstream changes
# are picked up on the next `terraform init -upgrade` — consider pinning.
module "flux_compute_instance_template" {
  source          = "github.com/terraform-google-modules/terraform-google-vm/modules/instance_template"
  region          = var.region
  project_id      = var.project_id
  name_prefix     = var.name_prefix
  subnetwork      = var.subnetwork
  gpu             = var.gpu
  service_account = var.service_account
  tags            = ["ssh", "flux", "compute"]
  machine_type    = var.machine_type
  disk_size_gb    = 256

  # Index the map with the variable directly; "${...}" around a lone
  # expression is redundant interpolation.
  source_image         = local.compute_images[var.machine_arch].image
  source_image_project = local.compute_images[var.machine_arch].project

  automatic_restart   = local.automatic_restart
  on_host_maintenance = local.on_host_maintenance
  startup_script      = var.boot_script

  metadata = {
    "enable-oslogin" : "TRUE",
    "VmDnsSetting" : "GlobalDefault",
    "nfs-mounts" : jsonencode(var.nfs_mounts),
    "gpus-attached" : var.gpu != null ? "TRUE" : "FALSE"
  }
}
71+
72+
module "flux_compute_instances" {
73+
source = "github.com/terraform-google-modules/terraform-google-vm/modules/compute_instance"
74+
region = var.region
75+
zone = data.google_compute_zones.available.names[0]
76+
hostname = var.name_prefix
77+
add_hostname_suffix = true
78+
num_instances = var.num_instances
79+
resource_policies = var.compact_placement ? [ google_compute_resource_policy.collocated[0].self_link ] : []
80+
instance_template = module.flux_compute_instance_template.self_link
81+
subnetwork = var.subnetwork
82+
}
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
variable "automatic_restart" {
2+
type = bool
3+
description = "(Optional) Specifies whether the instance should be automatically restarted if it is terminated by Compute Engine (not terminated by a user)."
4+
default = true
5+
}
6+
7+
variable "boot_script" {
8+
description = "(Optional) the name of a file containing a script to be executed on compute nodes at boot time"
9+
type = string
10+
default = null
11+
}
12+
13+
variable "compact_placement" {
14+
description = "(Optional) a boolean which determines whether a set of compute nodes has a compact placement resource policy attached to them."
15+
type = bool
16+
default = false
17+
}
18+
19+
variable "family" {
20+
description = "The source X86 image family prefix to use"
21+
type = string
22+
default = "flux-framework"
23+
}
24+
25+
variable "gpu" {
26+
description = "The type and count of GPU(s) to attach to a compute node"
27+
type = object({
28+
type = string
29+
count = number
30+
})
31+
default = null
32+
}
33+
34+
# Used as the lookup key into local.compute_images, so the value must be
# spelled exactly like one of that map's keys.
variable "machine_arch" {
  description = "The instruction set architecture used by the compute node; must match a key of local.compute_images (currently only 'x86-64')"
  type        = string
}
38+
39+
variable "machine_type" {
40+
description = "The Compute Engine machine type to be used for the compute node"
41+
type = string
42+
}
43+
44+
variable "name_prefix" {
45+
description = "The name prefix for the compute node instances, the full instances names will be this prefix followed by a node number"
46+
type = string
47+
}
48+
49+
variable "nfs_mounts" {
50+
description = "A map with keys 'share' and 'mountpoint' describing an NFS export and its intended mount point"
51+
type = map(string)
52+
default = {}
53+
}
54+
55+
variable "num_instances" {
56+
description = "The number of compute node instances to create"
57+
type = number
58+
default = 1
59+
}
60+
61+
variable "on_host_maintenance" {
62+
type = string
63+
description = "Instance availability Policy"
64+
default = "MIGRATE"
65+
}
66+
67+
variable "project_id" {
68+
description = "The GCP project ID"
69+
type = string
70+
}
71+
72+
variable "region" {
73+
description = "The GCP region where the cluster resides"
74+
type = string
75+
}
76+
77+
variable "service_account" {
78+
description = "The GCP service account used by the compute node"
79+
type = object({
80+
email = string
81+
scopes = set(string)
82+
})
83+
}
84+
85+
variable "subnetwork" {
86+
description = "Subnetwork to deploy to"
87+
type = string
88+
}

tutorial/google/base/variables.tf

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
2+
variable "cluster_storage" {
3+
description = "A map with keys 'share' and 'mountpoint' describing an NFS export and its intended mount point"
4+
type = map(string)
5+
}
6+
7+
# Image family passed through to the compute module. This value is always
# forwarded, so the module's own default never applies; keep the two defaults
# in agreement.
# NOTE(review): the previous default was "global", which does not look like an
# image family — confirm against the family name produced by the packer build.
variable "family" {
  description = "The source image x86 prefix to be used by the compute node(s)"
  type        = string
  default     = "flux-framework"
}
12+
13+
variable "compute_node_specs" {
14+
description = "A list of compute node specifications"
15+
type = list(object({
16+
name_prefix = string
17+
machine_arch = string
18+
machine_type = string
19+
gpu_type = string
20+
gpu_count = number
21+
compact = bool
22+
instances = number
23+
properties = set(string)
24+
boot_script = string
25+
}))
26+
default = []
27+
}
28+
29+
variable "compute_scopes" {
30+
description = "The set of access scopes for compute node instances"
31+
default = [ "cloud-platform" ]
32+
type = set(string)
33+
}
34+
35+
variable "project_id" {
36+
description = "The GCP project ID"
37+
type = string
38+
}
39+
40+
variable "region" {
41+
description = "The GCP region where the cluster resides"
42+
type = string
43+
}
44+
45+
variable "service_account_emails" {
46+
description = "A map with keys: 'compute', 'login', 'manager' that map to the service account to be used by the respective nodes"
47+
type = map(string)
48+
}
49+
50+
variable "subnetwork" {
51+
description = "Subnetwork to deploy to"
52+
type = string
53+
}

tutorial/google/build-images/Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,6 @@ validate:
1515

1616
.PHONY: build
1717
build:
18-
# Rocky-8-EC2-Base-8.7-20230215.0.x86_64-d6577ceb-8ea8-4e0e-84c6-f098fc302e82
19-
packer build -var machine_architecture="x86-64" -var project_id=${GOOGLE_PROJECT} build.pkr.hcl
20-
# gcloud builds submit --config=cloudbuild.yaml .
18+
# On error, the best strategy I've found is to ssh in, figure out what isn't there
19+
# (where it failed) and debug
20+
packer build --on-error=ask flux-build.pkr.hcl

0 commit comments

Comments
 (0)