Skip to content

Commit be28b89

Browse files
[CI] Attach pod disruption budgets to runner pods (#523)
This patch adds some pod disruption budgets to runner pods that just set the minimum number of available pods to the maximum. This ensures that the number of pods that k8s calculates can be disrupted is zero. This means that when GKE is updating the node pool, it must wait an hour before forcibly evicting the pod, giving it time to finish. Before this, when GKE wanted to upgrade a node, it would forcibly evict the pod very quickly (theoretically after the grace period, which has a default of 30s), not realizing it is stateful.
1 parent 03dad18 commit be28b89

File tree

2 files changed

+40
-0
lines changed

2 files changed

+40
-0
lines changed

premerge/pod_disruption_budget.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Template for a PodDisruptionBudget covering one runner scale set.
# Rendered through Terraform's templatefile() with two variables:
#   runner_set_name - used both as the namespace and as the value of the
#                     scale-set-name label the selector matches on.
#   min_pod_count   - value substituted into minAvailable.
apiVersion: policy/v1
2+
kind: PodDisruptionBudget
3+
metadata:
4+
name: runner-set-pdb
5+
namespace: ${ runner_set_name }
6+
spec:
7+
# Callers pass the maximum possible runner pod count here so that the number
# of pods allowed to be disrupted is never greater than zero.
minAvailable: ${ min_pod_count }
8+
selector:
9+
matchLabels:
10+
actions.github.com/scale-set-name: ${ runner_set_name }

premerge/premerge_resources/main.tf

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,36 @@ resource "kubernetes_service_account" "windows_2022_object_cache_ksa" {
258258
depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_runners]
259259
}
260260

261+
# We set up pod disruption budgets here. We need one per namespace and we need
262+
# to set the min pod count to the maximum number of runner pods that can
263+
# possibly exist so we never have a number of disruptible pods greater than
264+
# zero.
265+
266+
# Pod disruption budget for the llvm-premerge-linux-runners scale set,
# rendered from pod_disruption_budget.yaml with min_pod_count = 16.
resource "kubernetes_manifest" "linux_runners_disruption_budget" {
267+
manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-linux-runners", min_pod_count : 16 }))
268+
# The budget is created inside the runner namespace, so that namespace must
# exist first.
depends_on = [kubernetes_namespace.llvm_premerge_linux_runners]
269+
}
270+
271+
# Pod disruption budget for the llvm-premerge-windows-2022-runners scale set,
# rendered from pod_disruption_budget.yaml with min_pod_count = 16.
resource "kubernetes_manifest" "windows_2022_runners_disruption_budget" {
272+
manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-windows-2022-runners", min_pod_count : 16 }))
273+
# The budget lives in the Windows 2022 runner namespace, so depend on that
# namespace rather than the Linux one; otherwise Terraform may try to create
# the manifest before its namespace exists.
depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_runners]
274+
}
275+
276+
# Pod disruption budget for the llvm-premerge-libcxx-runners scale set,
# rendered from pod_disruption_budget.yaml with min_pod_count = 32.
resource "kubernetes_manifest" "libcxx_runners_disruption_budget" {
277+
manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-libcxx-runners", min_pod_count : 32 }))
278+
# NOTE(review): this depends on the Linux runners namespace, but the manifest
# is created in the llvm-premerge-libcxx-runners namespace. Confirm whether
# it should reference that namespace's own kubernetes_namespace resource.
depends_on = [kubernetes_namespace.llvm_premerge_linux_runners]
279+
}
280+
281+
# Pod disruption budget for the llvm-premerge-libcxx-release-runners scale
# set, rendered from pod_disruption_budget.yaml with min_pod_count = 32.
resource "kubernetes_manifest" "libcxx_release_runners_disruption_budget" {
282+
manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-libcxx-release-runners", min_pod_count : 32 }))
283+
# NOTE(review): this depends on the Linux runners namespace, but the manifest
# is created in the llvm-premerge-libcxx-release-runners namespace. Confirm
# whether it should reference that namespace's own kubernetes_namespace
# resource.
depends_on = [kubernetes_namespace.llvm_premerge_linux_runners]
284+
}
285+
286+
# Pod disruption budget for the llvm-premerge-libcxx-next-runners scale set,
# rendered from pod_disruption_budget.yaml with min_pod_count = 32.
resource "kubernetes_manifest" "libcxx_next_runners_disruption_budget" {
287+
manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-libcxx-next-runners", min_pod_count : 32 }))
288+
# NOTE(review): this depends on the Linux runners namespace, but the manifest
# is created in the llvm-premerge-libcxx-next-runners namespace. Confirm
# whether it should reference that namespace's own kubernetes_namespace
# resource.
depends_on = [kubernetes_namespace.llvm_premerge_linux_runners]
289+
}
290+
261291
resource "kubernetes_namespace" "grafana" {
262292
metadata {
263293
name = "grafana"

0 commit comments

Comments
 (0)