diff --git a/.ci/infrastructure/.gitignore b/.ci/infrastructure/.gitignore new file mode 100644 index 0000000000000..e79eb23105214 --- /dev/null +++ b/.ci/infrastructure/.gitignore @@ -0,0 +1 @@ +.terraform* diff --git a/.ci/infrastructure/README.md b/.ci/infrastructure/README.md new file mode 100644 index 0000000000000..b01652d87a002 --- /dev/null +++ b/.ci/infrastructure/README.md @@ -0,0 +1,31 @@ +# Premerge Infrastructure + +This folder contains the terraform configuration files that define the GCP +resources used to run the premerge checks. Currently, only Google employees +with access to the GCP project where these checks are hosted are able to apply +changes. Pull requests from anyone are still welcome. + +## Setup + +- install terraform (https://developer.hashicorp.com/terraform/install?product_intent=terraform) +- get the GCP tokens: `gcloud auth application-default login` +- initialize terraform: `terraform init` + +To apply any changes to the cluster: +- set up the cluster: `terraform apply` +- terraform will display the list of proposed changes. +- enter 'yes' when prompted. + +## Setting the cluster up for the first time + +``` +terraform apply -target google_container_node_pool.llvm_premerge_linux_service +terraform apply -target google_container_node_pool.llvm_premerge_linux +terraform apply -target google_container_node_pool.llvm_premerge_windows +terraform apply +``` + +Setting the cluster up for the first time is more involved as there are certain +resources where terraform is unable to handle explicit dependencies. This means +that we have to set up the GKE cluster before we set up any of the Kubernetes +resources as otherwise the Terraform Kubernetes provider will error out. 
diff --git a/.ci/infrastructure/backend.tf b/.ci/infrastructure/backend.tf new file mode 100644 index 0000000000000..9fe5e692a9c22 --- /dev/null +++ b/.ci/infrastructure/backend.tf @@ -0,0 +1,5 @@ +terraform { + backend "gcs" { + bucket = "3772b2f502380a18-terraform-remote-backend" + } +} diff --git a/.ci/infrastructure/grafana_values.yaml b/.ci/infrastructure/grafana_values.yaml new file mode 100644 index 0000000000000..9acdd429b9ed8 --- /dev/null +++ b/.ci/infrastructure/grafana_values.yaml @@ -0,0 +1,41 @@ +metrics: + enabled: true + alloy: + metricsTuning: + useIntegrationAllowList: true + cost: + enabled: true + kepler: + enabled: true + node-exporter: + enabled: true +logs: + enabled: true + pod_logs: + enabled: true + cluster_events: + enabled: true +traces: + enabled: true +receivers: + grpc: + enabled: true + http: + enabled: true + zipkin: + enabled: true + grafanaCloudMetrics: + enabled: false +opencost: + enabled: true +kube-state-metrics: + enabled: true +prometheus-node-exporter: + enabled: true +prometheus-operator-crds: + enabled: true +kepler: + enabled: true +alloy: {} +alloy-events: {} +alloy-logs: {} diff --git a/.ci/infrastructure/linux_container_pod_template.yaml b/.ci/infrastructure/linux_container_pod_template.yaml new file mode 100644 index 0000000000000..c7d1baf3515a2 --- /dev/null +++ b/.ci/infrastructure/linux_container_pod_template.yaml @@ -0,0 +1,26 @@ +spec: + tolerations: + - key: "premerge-platform" + operator: "Equal" + value: "linux" + effect: "NoSchedule" + nodeSelector: + premerge-platform: linux + containers: + - name: $job + resources: + # The container is always scheduled on the same node as the runner. + # Since we use the runner requests.cpu for scheduling/autoscaling, + # the request here should be set to something small. + # + # The limit however should be the number of cores of the node. Any limit + # lower than the number of cores could slow down the job. + # + # For memory however, the request/limits shall be correct. 
+ # It's not used for scheduling, but can be used by k8s for OOM kill. + requests: + cpu: "100m" + memory: "50Gi" + limits: + cpu: 56 + memory: "100Gi" diff --git a/.ci/infrastructure/linux_runners_values.yaml b/.ci/infrastructure/linux_runners_values.yaml new file mode 100644 index 0000000000000..407e81436fafd --- /dev/null +++ b/.ci/infrastructure/linux_runners_values.yaml @@ -0,0 +1,74 @@ +githubConfigUrl: "https://github.com/llvm" +githubConfigSecret: "github-token" + +minRunners: 0 +maxRunners: 4 + +containerMode: + type: "kubernetes" + kubernetesModeWorkVolumeClaim: + accessModes: ["ReadWriteOnce"] + storageClassName: "standard-rwo" + resources: + requests: + storage: "100Gi" + kubernetesModeServiceAccount: + annotations: + +template: + spec: + tolerations: + - key: "premerge-platform" + operator: "Equal" + value: "linux" + effect: "NoSchedule" + nodeSelector: + premerge-platform: linux + containers: + - name: runner + image: ghcr.io/actions/actions-runner:latest + command: ["/home/runner/run.sh"] + resources: + # The container will be scheduled on the same node as this runner. + # This means if we don't set the CPU request high enough here, 2 + # runner pods will be scheduled on the same node, meaning 2 jobs. + # + # This number should be: + # - greater than number_of_cores / 2: + # A value lower than that could allow the scheduler to put 2 + # runners on the same node. Meaning 2 job pods on the same node. + # Meaning 2 jobs sharing the resources. + # - lower than number_of_cores: + # Each node has some basic services running (metrics, for example). + # Those already require some amount of CPU (~0.5). This means we don't + # exactly have N cores to allocate, but N - epsilon. + # + # Memory however shall be handled at the container level. The runner + # itself doesn't need much, just enough not to get + # OOM killed. 
+ requests: + cpu: 50 + memory: "2Gi" + limits: + cpu: 56 + memory: "2Gi" + env: + - name: ACTIONS_RUNNER_CONTAINER_HOOKS + value: /home/runner/k8s/index.js + - name: ACTIONS_RUNNER_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: ACTIONS_RUNNER_REQUIRE_JOB_CONTAINER + value: "true" + - name: ACTIONS_RUNNER_CONTAINER_HOOK_TEMPLATE + value: "/home/runner/pod-config/linux-container-pod-template.yaml" + volumeMounts: + - name: container-pod-config + mountPath: /home/runner/pod-config + securityContext: + fsGroup: 123 + volumes: + - name: container-pod-config + configMap: + name: linux-container-pod-template diff --git a/.ci/infrastructure/main.tf b/.ci/infrastructure/main.tf new file mode 100644 index 0000000000000..4c7f554a97bb4 --- /dev/null +++ b/.ci/infrastructure/main.tf @@ -0,0 +1,375 @@ +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "4.51.0" + } + } +} + +provider "google" { + project = "llvm-premerge-checks" +} + +resource "random_id" "default" { + byte_length = 8 +} + +resource "google_storage_bucket" "terraform_state_bucket" { + name = "${random_id.default.hex}-terraform-remote-backend" + location = "US" + + force_destroy = false + public_access_prevention = "enforced" + uniform_bucket_level_access = true + + versioning { + enabled = true + } +} + +resource "local_file" "terraform_state" { + file_permission = "0644" + filename = "${path.module}/backend.tf" + + content = <<-EOT + terraform { + backend "gcs" { + bucket = "${google_storage_bucket.terraform_state_bucket.name}" + } + } + EOT +} + +data "google_client_config" "current" {} + +resource "google_container_cluster" "llvm_premerge" { + name = var.cluster_name + location = "europe-west3-a" + + # We can't create a cluster with no node pool defined, but we want to only use + # separately managed node pools. So we create the smallest possible default + # node pool and immediately delete it. 
+ remove_default_node_pool = true + initial_node_count = 1 + + # Set the networking mode to VPC Native to enable IP aliasing, which is required + # for adding windows nodes to the cluster. + networking_mode = "VPC_NATIVE" + ip_allocation_policy {} +} + +resource "google_container_node_pool" "llvm_premerge_linux_service" { + name = "llvm-premerge-linux-service" + location = "europe-west3-a" + cluster = google_container_cluster.llvm_premerge.name + node_count = 2 + + node_config { + machine_type = "e2-small" + } +} + +resource "google_container_node_pool" "llvm_premerge_linux" { + name = "llvm-premerge-linux" + location = "europe-west3-a" + cluster = google_container_cluster.llvm_premerge.name + initial_node_count = 0 + + autoscaling { + total_min_node_count = 0 + total_max_node_count = 4 + } + + node_config { + machine_type = "c2d-highcpu-56" + taint = [{ + key = "premerge-platform" + value = "linux" + effect = "NO_SCHEDULE" + }] + labels = { + "premerge-platform" : "linux" + } + } +} + +resource "google_container_node_pool" "llvm_premerge_windows" { + name = "llvm-premerge-windows" + location = "europe-west3-a" + cluster = google_container_cluster.llvm_premerge.name + initial_node_count = 0 + + autoscaling { + total_min_node_count = 0 + total_max_node_count = 2 + } + + # We do not set a taint for the windows nodes as kubernetes by default sets + # a node.kubernetes.io/os taint for windows nodes. 
+ node_config { + machine_type = "c2d-highcpu-56" + labels = { + "premerge-platform" : "windows" + } + image_type = "WINDOWS_LTSC_CONTAINERD" + } +} + +provider "helm" { + kubernetes { + host = google_container_cluster.llvm_premerge.endpoint + token = data.google_client_config.current.access_token + client_certificate = base64decode(google_container_cluster.llvm_premerge.master_auth.0.client_certificate) + client_key = base64decode(google_container_cluster.llvm_premerge.master_auth.0.client_key) + cluster_ca_certificate = base64decode(google_container_cluster.llvm_premerge.master_auth.0.cluster_ca_certificate) + } +} + +data "google_secret_manager_secret_version" "github_app_id" { + secret = "llvm-premerge-github-app-id" +} + +data "google_secret_manager_secret_version" "github_app_installation_id" { + secret = "llvm-premerge-github-app-installation-id" +} + +data "google_secret_manager_secret_version" "github_app_private_key" { + secret = "llvm-premerge-github-app-private-key" +} + +data "google_secret_manager_secret_version" "grafana_token" { + secret = "llvm-premerge-testing-grafana-token" +} + +provider "kubernetes" { + host = "https://${google_container_cluster.llvm_premerge.endpoint}" + token = data.google_client_config.current.access_token + cluster_ca_certificate = base64decode( + google_container_cluster.llvm_premerge.master_auth[0].cluster_ca_certificate, + ) +} + +resource "kubernetes_namespace" "llvm_premerge_controller" { + metadata { + name = "llvm-premerge-controller" + } +} + +resource "kubernetes_namespace" "llvm_premerge_linux_runners" { + metadata { + name = "llvm-premerge-linux-runners" + } +} + +resource "kubernetes_secret" "linux_github_pat" { + metadata { + name = "github-token" + namespace = "llvm-premerge-linux-runners" + } + + data = { + "github_app_id" = data.google_secret_manager_secret_version.github_app_id.secret_data + "github_app_installation_id" = data.google_secret_manager_secret_version.github_app_installation_id.secret_data + 
"github_app_private_key" = data.google_secret_manager_secret_version.github_app_private_key.secret_data + } + + type = "Opaque" +} + +resource "kubernetes_namespace" "llvm_premerge_windows_runners" { + metadata { + name = "llvm-premerge-windows-runners" + } +} + +resource "kubernetes_secret" "windows_github_pat" { + metadata { + name = "github-token" + namespace = "llvm-premerge-windows-runners" + } + + data = { + "github_app_id" = data.google_secret_manager_secret_version.github_app_id.secret_data + "github_app_installation_id" = data.google_secret_manager_secret_version.github_app_installation_id.secret_data + "github_app_private_key" = data.google_secret_manager_secret_version.github_app_private_key.secret_data + } + + type = "Opaque" +} + + +resource "kubernetes_config_map" "linux_container_pod_template" { + metadata { + name = "linux-container-pod-template" + namespace = "llvm-premerge-linux-runners" + } + + data = { + "linux-container-pod-template.yaml" : "${file("linux_container_pod_template.yaml")}" + } +} + +resource "helm_release" "github_actions_runner_controller" { + name = "llvm-premerge-controller" + namespace = "llvm-premerge-controller" + repository = "oci://ghcr.io/actions/actions-runner-controller-charts" + version = "0.9.3" + chart = "gha-runner-scale-set-controller" + + depends_on = [ + kubernetes_namespace.llvm_premerge_controller + ] +} + +resource "helm_release" "github_actions_runner_set_linux" { + name = "llvm-premerge-linux-runners" + namespace = "llvm-premerge-linux-runners" + repository = "oci://ghcr.io/actions/actions-runner-controller-charts" + version = "0.9.3" + chart = "gha-runner-scale-set" + + values = [ + "${file("linux_runners_values.yaml")}" + ] + + depends_on = [ + kubernetes_namespace.llvm_premerge_linux_runners, + kubernetes_config_map.linux_container_pod_template, + kubernetes_secret.linux_github_pat + ] +} + +resource "helm_release" "github_actions_runner_set_windows" { + name = "llvm-premerge-windows-runners" + namespace 
= "llvm-premerge-windows-runners" + repository = "oci://ghcr.io/actions/actions-runner-controller-charts" + version = "0.9.3" + chart = "gha-runner-scale-set" + + values = [ + "${file("windows_runner_values.yaml")}" + ] + + depends_on = [ + kubernetes_namespace.llvm_premerge_windows_runners, + kubernetes_secret.windows_github_pat + ] +} + +resource "kubernetes_namespace" "grafana" { + metadata { + name = "grafana" + } +} + +resource "helm_release" "grafana-k8s-monitoring" { + name = "grafana-k8s-monitoring" + repository = "https://grafana.github.io/helm-charts" + chart = "k8s-monitoring" + namespace = "grafana" + create_namespace = true + atomic = true + timeout = 300 + + values = [file("${path.module}/grafana_values.yaml")] + + set { + name = "cluster.name" + value = var.cluster_name + } + + set { + name = "externalServices.prometheus.host" + value = var.externalservices_prometheus_host + } + + set_sensitive { + name = "externalServices.prometheus.basicAuth.username" + value = var.externalservices_prometheus_basicauth_username + } + + set_sensitive { + name = "externalServices.prometheus.basicAuth.password" + value = data.google_secret_manager_secret_version.grafana_token.secret_data + } + + set { + name = "externalServices.loki.host" + value = var.externalservices_loki_host + } + + set_sensitive { + name = "externalServices.loki.basicAuth.username" + value = var.externalservices_loki_basicauth_username + } + + set_sensitive { + name = "externalServices.loki.basicAuth.password" + value = data.google_secret_manager_secret_version.grafana_token.secret_data + } + + set { + name = "externalServices.tempo.host" + value = var.externalservices_tempo_host + } + + set_sensitive { + name = "externalServices.tempo.basicAuth.username" + value = var.externalservices_tempo_basicauth_username + } + + set_sensitive { + name = "externalServices.tempo.basicAuth.password" + value = data.google_secret_manager_secret_version.grafana_token.secret_data + } + + set { + name = 
"opencost.opencost.exporter.defaultClusterId" + value = var.cluster_name + } + + set { + name = "opencost.opencost.prometheus.external.url" + value = format("%s/api/prom", var.externalservices_prometheus_host) + } + + depends_on = [ kubernetes_namespace.grafana ] +} + +data "google_secret_manager_secret_version" "metrics_github_pat" { + secret = "llvm-premerge-metrics-github-pat" +} + +data "google_secret_manager_secret_version" "metrics_grafana_api_key" { + secret = "llvm-premerge-metrics-grafana-api-key" +} + +data "google_secret_manager_secret_version" "metrics_grafana_metrics_userid" { + secret = "llvm-premerge-metrics-grafana-metrics-userid" +} + +resource "kubernetes_namespace" "metrics" { + metadata { + name = "metrics" + } +} + +resource "kubernetes_secret" "metrics_secrets" { + metadata { + name = "metrics-secrets" + namespace = "metrics" + } + + data = { + "github-token" = data.google_secret_manager_secret_version.metrics_github_pat.secret_data + "grafana-api-key" = data.google_secret_manager_secret_version.metrics_grafana_api_key.secret_data + "grafana-metrics-userid" = data.google_secret_manager_secret_version.metrics_grafana_metrics_userid.secret_data + } + + type = "Opaque" +} + +resource "kubernetes_manifest" "metrics_deployment" { + manifest = yamldecode(file("metrics_deployment.yaml")) +} diff --git a/.ci/infrastructure/metrics_deployment.yaml b/.ci/infrastructure/metrics_deployment.yaml new file mode 100644 index 0000000000000..34f5f69012235 --- /dev/null +++ b/.ci/infrastructure/metrics_deployment.yaml @@ -0,0 +1,38 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: metrics + namespace: metrics + labels: + app: metrics +spec: + replicas: 1 + selector: + matchLabels: + app: metrics + template: + metadata: + labels: + app: metrics + spec: + containers: + - name: metrics + # TODO(boomanaiden154): Update this image when we have the metrics + # container build job in the monorepo. 
+ image: ghcr.io/keenuts-test-org/llvm-premerge-infra-metrics:debug + env: + - name: GITHUB_TOKEN + valueFrom: + secretKeyRef: + name: metrics-secrets + key: github-token + - name: GRAFANA_API_KEY + valueFrom: + secretKeyRef: + name: metrics-secrets + key: grafana-api-key + - name: GRAFANA_METRICS_USERID + valueFrom: + secretKeyRef: + name: metrics-secrets + key: grafana-metrics-userid diff --git a/.ci/infrastructure/vars.tf b/.ci/infrastructure/vars.tf new file mode 100644 index 0000000000000..7584b27a6f3e3 --- /dev/null +++ b/.ci/infrastructure/vars.tf @@ -0,0 +1,34 @@ +variable "cluster_name" { + type = string + default = "llvm-premerge-prototype" +} + +variable "externalservices_prometheus_host" { + type = string + default = "https://prometheus-prod-13-prod-us-east-0.grafana.net" +} + +variable "externalservices_prometheus_basicauth_username" { + type = number + default = 1716097 +} + +variable "externalservices_loki_host" { + type = string + default = "https://logs-prod-006.grafana.net" +} + +variable "externalservices_loki_basicauth_username" { + type = number + default = 957850 +} + +variable "externalservices_tempo_host" { + type = string + default = "https://tempo-prod-04-prod-us-east-0.grafana.net:443" +} + +variable "externalservices_tempo_basicauth_username" { + type = number + default = 952165 +} diff --git a/.ci/infrastructure/windows_runner_values.yaml b/.ci/infrastructure/windows_runner_values.yaml new file mode 100644 index 0000000000000..6d2977621b133 --- /dev/null +++ b/.ci/infrastructure/windows_runner_values.yaml @@ -0,0 +1,32 @@ +githubConfigUrl: "https://github.com/llvm" +githubConfigSecret: "github-token" + +minRunners: 0 +maxRunners: 2 + +template: + spec: + tolerations: + - key: "node.kubernetes.io/os" + operator: "Equal" + value: "windows" + effect: "NoSchedule" + - key: "premerge-platform" + operator: "Equal" + value: "windows" + effect: "NoSchedule" + nodeSelector: + premerge-platform: windows + containers: + - name: runner + resources: 
+ requests: + cpu: 50 + # TODO(boomanaiden154): Update this image when we have a proper windows + # image ready. + image: ghcr.io/keenuts-test-org/windows-ci-image:latest + command: ["run.cmd"] + args: ["--jitconfig", "$(ACTIONS_RUNNER_INPUT_JITCONFIG)"] + env: + - name: DISABLE_RUNNER_UPDATE + value: "true"