Skip to content

Commit 99906b3

Browse files
committed
enable nested virt & set single process oom kill
1 parent f3ac20e commit 99906b3

File tree

10 files changed

+66
-77
lines changed

10 files changed

+66
-77
lines changed

infra/gcp/terraform/k8s-infra-prow-build/iam.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ module "iam" {
3232
"roles/secretmanager.secretAccessor" = [
3333
"serviceAccount:kubernetes-external-secrets@k8s-infra-prow-build.iam.gserviceaccount.com",
3434
"principal://iam.googleapis.com/projects/${module.project.project_number}/locations/global/workloadIdentityPools/${module.project.project_id}.svc.id.goog/subject/ns/external-secrets/sa/external-secrets",
35+
"principal://iam.googleapis.com/projects/180382678033/locations/global/workloadIdentityPools/k8s-infra-prow-build-trusted.svc.id.goog/subject/ns/external-secrets/sa/external-secrets",
3536
]
3637
}
3738
}

infra/gcp/terraform/k8s-infra-prow-build/main.tf

Lines changed: 25 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,6 @@ resource "google_project_iam_member" "k8s_infra_prow_viewers" {
6060
member = "group:[email protected]"
6161
}
6262

63-
// Allow prow-deployer service account in k8s-infra-prow-build-trusted to deploy
64-
// to the cluster defined in here
65-
resource "google_project_iam_member" "prow_deployer_for_prow_build" {
66-
project = module.project.project_id
67-
role = "roles/container.admin"
68-
member = "serviceAccount:prow-deployer@k8s-infra-prow-build-trusted.iam.gserviceaccount.com"
69-
}
70-
7163
module "prow_build_cluster" {
7264
source = "../modules/gke-cluster"
7365
project_name = module.project.project_id
@@ -86,18 +78,30 @@ module "prow_build_nodepool_c4_highmem_8_localssd" {
8678
cluster_name = module.prow_build_cluster.cluster.name
8779
location = module.prow_build_cluster.cluster.location
8880
node_locations = [
81+
"us-central1-a",
8982
"us-central1-b",
9083
"us-central1-c",
9184
"us-central1-f",
9285
]
93-
name = "pool6"
94-
initial_count = 1
95-
min_count = 1
96-
max_count = 80
97-
machine_type = "c4-highmem-8"
98-
disk_size_gb = 500
99-
disk_type = "hyperdisk-balanced"
100-
service_account = module.prow_build_cluster.cluster_node_sa.email
86+
name = "pool6"
87+
initial_count = 1
88+
min_count = 1
89+
max_count = 250 # total across all zones
90+
machine_type = "c4-highmem-8-lssd"
91+
disk_size_gb = 100
92+
disk_type = "hyperdisk-balanced"
93+
enable_nested_virtualization = true
94+
service_account = module.prow_build_cluster.cluster_node_sa.email
95+
// This taint exists to bias workloads onto the C4D nodepool first; if we can't secure a C4D node,
96+
// we fall back to scheduling onto a C4 node. C4D performs better than C4 but it is capacity constrained at times.
97+
// Also, nested virt doesn't work on C4D or C4A
98+
taints = [
99+
{
100+
key = "spare"
101+
value = "true"
102+
effect = "PREFER_NO_SCHEDULE"
103+
}
104+
]
101105
}
102106

103107
module "prow_build_nodepool_c4d_highmem_8_localssd" {
@@ -113,53 +117,13 @@ module "prow_build_nodepool_c4d_highmem_8_localssd" {
113117
name = "pool7"
114118
initial_count = 1
115119
min_count = 10
116-
max_count = 80
117-
machine_type = "c4d-highmem-8-lssd" # has 2 local ssd disks attached
120+
max_count = 250 # total across all zones
121+
machine_type = "c4d-highmem-8-lssd" # has 1 local ssd disk attached
118122
disk_size_gb = 100
119123
disk_type = "hyperdisk-balanced"
120124
service_account = module.prow_build_cluster.cluster_node_sa.email
121125
}
122126

123-
124-
module "sig_node_node_pool_1_n4_highmem_8" {
125-
126-
source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gke-nodepool?ref=v39.0.0&depth=1"
127-
project_id = module.project.project_id
128-
name = "sig-node-pool1"
129-
location = module.prow_build_cluster.cluster.location
130-
cluster_name = module.prow_build_cluster.cluster.name
131-
132-
service_account = {
133-
email = module.prow_build_cluster.cluster_node_sa.email
134-
oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
135-
}
136-
137-
nodepool_config = {
138-
autoscaling = {
139-
max_node_count = 10
140-
min_node_count = 1 # 1 per zone
141-
}
142-
management = {
143-
auto_repair = true
144-
auto_upgrade = true
145-
}
146-
}
147-
148-
node_config = {
149-
machine_type = "n4-highmem-8"
150-
disk_type = "hyperdisk-balanced"
151-
image_type = "COS_CONTAINERD"
152-
gvnic = true
153-
workload_metadata_config_mode = "GKE_METADATA"
154-
shielded_instance_config = {
155-
enable_secure_boot = true
156-
}
157-
}
158-
159-
160-
taints = { dedicated = { value = "sig-node", effect = "NO_SCHEDULE" } }
161-
}
162-
163127
module "prow_build_nodepool_c4a_highmem_8_localssd" {
164128
source = "../modules/gke-nodepool"
165129
project_name = module.project.project_id
@@ -169,11 +133,12 @@ module "prow_build_nodepool_c4a_highmem_8_localssd" {
169133
"us-central1-a",
170134
"us-central1-b",
171135
"us-central1-c",
136+
"us-central1-f",
172137
]
173138
name = "pool7-arm64"
174139
initial_count = 1
175-
min_count = 1
176-
max_count = 10
140+
min_count = 3
141+
max_count = 100 # total across all zones
177142
machine_type = "c4a-highmem-8-lssd" # has 2 local ssd disks attached
178143
disk_size_gb = 100
179144
disk_type = "hyperdisk-balanced"

infra/gcp/terraform/k8s-infra-prow-build/peering.tf

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@ limitations under the License.
1515
*/
1616

1717
resource "google_vmwareengine_network_peering" "gvce_peering" {
18-
name = "peer-with-gcve-project"
19-
peer_network = "projects/k8s-infra-prow-build/global/networks/default"
20-
project = module.project.project_id
21-
peer_network_type = "STANDARD"
22-
vmware_engine_network = "projects/broadcom-451918/locations/global/vmwareEngineNetworks/k8s-gcp-gcve-network"
18+
name = "peer-with-gcve-project"
19+
peer_network = "projects/k8s-infra-prow-build/global/networks/default"
20+
project = module.project.project_id
21+
peer_network_type = "STANDARD"
22+
vmware_engine_network = "projects/broadcom-451918/locations/global/vmwareEngineNetworks/k8s-gcp-gcve-network"
23+
export_custom_routes_with_public_ip = true
24+
import_custom_routes_with_public_ip = true
2325
}

infra/gcp/terraform/k8s-infra-prow-build/00-provider.tf renamed to infra/gcp/terraform/k8s-infra-prow-build/provider.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@ terraform {
3030
required_providers {
3131
google = {
3232
source = "hashicorp/google"
33-
version = "~> 6.31.0"
33+
version = "~> 7.7.0"
3434
}
3535
google-beta = {
3636
source = "hashicorp/google-beta"
37-
version = "~> 6.31.0"
37+
version = "~> 7.7.0"
3838
}
3939
}
4040
}

infra/gcp/terraform/modules/gke-cluster/versions.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@ terraform {
2020
required_providers {
2121
google = {
2222
source = "hashicorp/google"
23-
version = "~> 6.31.0"
23+
version = ">=6.31.0"
2424
}
2525
google-beta = {
2626
source = "hashicorp/google-beta"
27-
version = "~> 6.31.0"
27+
version = ">=6.31.0"
2828
}
2929
}
3030
}

infra/gcp/terraform/modules/gke-nodepool/main.tf

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,17 @@ resource "google_container_node_pool" "node_pool" {
3131
auto_upgrade = true
3232
}
3333

34+
upgrade_settings {
35+
max_unavailable = 0
36+
max_surge = 10
37+
}
38+
3439
// Autoscale the cluster as needed. Note: in a regional cluster initial_node_count is per zone, while the total_* autoscaling limits apply to the whole node pool across all zones
3540
initial_node_count = var.initial_count
3641
autoscaling {
37-
min_node_count = var.min_count
38-
max_node_count = var.max_count
42+
total_min_node_count = var.min_count
43+
total_max_node_count = var.max_count
44+
location_policy = "ANY"
3945
}
4046
node_locations = var.node_locations
4147

@@ -49,6 +55,9 @@ resource "google_container_node_pool" "node_pool" {
4955

5056
service_account = var.service_account
5157
oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
58+
kubelet_config {
59+
single_process_oom_kill = true # https://github.com/kubernetes-sigs/prow/issues/210
60+
}
5261

5362
dynamic "ephemeral_storage_config" {
5463
for_each = var.ephemeral_local_ssd_count > 0 ? [var.ephemeral_local_ssd_count] : []
@@ -57,6 +66,11 @@ resource "google_container_node_pool" "node_pool" {
5766
}
5867
}
5968

69+
advanced_machine_features {
70+
enable_nested_virtualization = var.enable_nested_virtualization
71+
threads_per_core = 0
72+
}
73+
6074
// Needed for workload identity
6175
workload_metadata_config {
6276
mode = "GKE_METADATA"
@@ -72,6 +86,7 @@ resource "google_container_node_pool" "node_pool" {
7286
value = taint.value.value
7387
}
7488
}
89+
7590
}
7691

7792
// If we need to destroy the node pool, create the new one before destroying

infra/gcp/terraform/modules/gke-nodepool/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,9 @@ variable "service_account" {
107107
description = "The email address of the GCP Service Account to be associated with nodes in this node_pool"
108108
type = string
109109
}
110+
111+
variable "enable_nested_virtualization" {
112+
description = "Whether to enable nested virtualization on the node pool's VMs"
113+
type = bool
114+
default = false
115+
}

infra/gcp/terraform/modules/gke-nodepool/versions.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@ terraform {
2020
required_providers {
2121
google = {
2222
source = "hashicorp/google"
23-
version = "~> 6.31.0"
23+
version = ">=6.31.0"
2424
}
2525
google-beta = {
2626
source = "hashicorp/google-beta"
27-
version = "~> 6.31.0"
27+
version = ">=6.31.0"
2828
}
2929
}
3030
}

infra/gcp/terraform/modules/gke-project/versions.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@ terraform {
2020
required_providers {
2121
google = {
2222
source = "hashicorp/google"
23-
version = "~> 6.31.0"
23+
version = ">=6.31.0"
2424
}
2525
google-beta = {
2626
source = "hashicorp/google-beta"
27-
version = "~> 6.31.0"
27+
version = ">=6.31.0"
2828
}
2929
}
3030
}

infra/gcp/terraform/modules/workload-identity-service-account/versions.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ terraform {
1717
required_providers {
1818
google = {
1919
source = "hashicorp/google"
20-
version = "~> 6.31.0"
20+
version = ">=6.31.0"
2121
}
2222
google-beta = {
2323
source = "hashicorp/google-beta"
24-
version = "~> 6.31.0"
24+
version = ">=6.31.0"
2525
}
2626
}
2727
}

0 commit comments

Comments
 (0)