diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index d1ae83c68..26b5ac4b0 100644
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -234,6 +234,14 @@ jobs:
             helm_version: ""
             experimental: false
 
+          - federation_member: ovh2
+            binder_url: https://ovh2.mybinder.org
+            hub_url: https://hub.ovh2.mybinder.org
+            # image-prefix should match the ovh2 registry config in secrets/config/ovh2.yaml
+            chartpress_args: "--push --image-prefix=2lmrrh8f.gra7.container-registry.ovh.net/mybinder-chart/mybinder-"
+            helm_version: ""
+            experimental: false
+
     steps:
       - name: "Stage 0: Update env vars based on job matrix arguments"
         run: |
@@ -288,7 +296,7 @@ jobs:
           GIT_CRYPT_KEY: ${{ secrets.GIT_CRYPT_KEY }}
 
       # Action Repo: https://github.com/Azure/docker-login
-      - name: "Stage 3: Login to Docker regstry (OVH)"
+      - name: "Stage 3: Login to Docker registry (OVH)"
        if: matrix.federation_member == 'ovh'
        uses: azure/docker-login@v1
        with:
@@ -296,6 +304,15 @@ jobs:
          username: ${{ secrets.DOCKER_USERNAME_OVH }}
          password: ${{ secrets.DOCKER_PASSWORD_OVH }}
 
+      - name: "Stage 3: Login to Docker registry (OVH2)"
+        if: matrix.federation_member == 'ovh2'
+        uses: azure/docker-login@v1
+        with:
+          login-server: 2lmrrh8f.gra7.container-registry.ovh.net
+          username: ${{ secrets.DOCKER_USERNAME_OVH2 }}
+          # terraform output registry_chartpress_token
+          password: ${{ secrets.DOCKER_PASSWORD_OVH2 }}
+
      - name: "Stage 3: Run chartpress to update values.yaml"
        run: |
          chartpress ${{ matrix.chartpress_args || '--skip-build' }}
diff --git a/.github/workflows/test-helm-template.yaml b/.github/workflows/test-helm-template.yaml
index ceebd5d11..dee7728f8 100644
--- a/.github/workflows/test-helm-template.yaml
+++ b/.github/workflows/test-helm-template.yaml
@@ -47,6 +47,8 @@ jobs:
            k3s-channel: "v1.21"
          - release: ovh
            k3s-channel: "v1.20"
+          - release: ovh2
+            k3s-channel: "v1.24"
          - release: turing
            k3s-channel: "v1.21"
 
diff --git a/.gitignore b/.gitignore
index 987e39a2d..340b738a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,4 @@ travis/crypt-key
 env
 
 .terraform
+.terraform.lock.hcl
diff --git a/config/ovh2.yaml b/config/ovh2.yaml
new file mode 100644
index 000000000..ee5a569f9
--- /dev/null
+++ b/config/ovh2.yaml
@@ -0,0 +1,125 @@
+projectName: ovh2
+
+userNodeSelector: &userNodeSelector
+  mybinder.org/pool-type: users
+coreNodeSelector: &coreNodeSelector
+  mybinder.org/pool-type: core
+
+binderhub:
+  config:
+    BinderHub:
+      pod_quota: 10
+      hub_url: https://hub.ovh2.mybinder.org
+      badge_base_url: https://mybinder.org
+      build_node_selector: *userNodeSelector
+      sticky_builds: true
+      image_prefix: 2lmrrh8f.gra7.container-registry.ovh.net/mybinder-builds/r2d-g5b5b759
+    DockerRegistry:
+      # Docker Registry uses harbor
+      # ref: https://github.com/goharbor/harbor/wiki/Harbor-FAQs#api
+      token_url: "https://2lmrrh8f.gra7.container-registry.ovh.net/service/token?service=harbor-registry"
+
+  replicas: 1
+  nodeSelector: *coreNodeSelector
+
+  extraVolumes:
+    - name: secrets
+      secret:
+        secretName: events-archiver-secrets
+  extraVolumeMounts:
+    - name: secrets
+      mountPath: /secrets
+      readOnly: true
+  extraEnv:
+    GOOGLE_APPLICATION_CREDENTIALS: /secrets/service-account.json
+
+  ingress:
+    hosts:
+      - ovh2.mybinder.org
+
+  jupyterhub:
+    singleuser:
+      nodeSelector: *userNodeSelector
+    hub:
+      nodeSelector: *coreNodeSelector
+
+    proxy:
+      chp:
+        nodeSelector: *coreNodeSelector
+        resources:
+          requests:
+            cpu: "1"
+          limits:
+            cpu: "1"
+    ingress:
+      hosts:
+        - hub.ovh2.mybinder.org
+      tls:
+        - secretName: kubelego-tls-hub
+          hosts:
+            - hub.ovh2.mybinder.org
+    scheduling:
+      userPlaceholder:
+        replicas: 5
+      userScheduler:
+        nodeSelector: *coreNodeSelector
+
+  imageCleaner:
+    # Use 40GB as upper limit, size is given in bytes
+    imageGCThresholdHigh: 40e9
+    imageGCThresholdLow: 30e9
+    imageGCThresholdType: "absolute"
+
+cryptnono:
+  enabled: false
+
+grafana:
+  nodeSelector: *coreNodeSelector
+  ingress:
+    hosts:
+      - grafana.ovh2.mybinder.org
+    tls:
+      - hosts:
+          - grafana.ovh2.mybinder.org
+        secretName: kubelego-tls-grafana
+  datasources:
+    datasources.yaml:
+      apiVersion: 1
+      datasources:
+        - name: prometheus
+          orgId: 1
+          type: prometheus
+          url: https://prometheus.ovh2.mybinder.org
+          access: direct
+          isDefault: true
+          editable: false
+  persistence:
+    storageClassName: csi-cinder-high-speed
+
+prometheus:
+  server:
+    nodeSelector: *coreNodeSelector
+    persistentVolume:
+      size: 50Gi
+    retention: 30d
+    ingress:
+      hosts:
+        - prometheus.ovh2.mybinder.org
+      tls:
+        - hosts:
+            - prometheus.ovh2.mybinder.org
+          secretName: kubelego-tls-prometheus
+
+ingress-nginx:
+  controller:
+    scope:
+      enabled: true
+    service:
+      loadBalancerIP: 162.19.17.37
+
+static:
+  ingress:
+    hosts:
+      - static.ovh2.mybinder.org
+    tls:
+      secretName: kubelego-tls-static
diff --git a/deploy.py b/deploy.py
index cb7b9a6b5..3eb00d61d 100755
--- a/deploy.py
+++ b/deploy.py
@@ -76,7 +76,7 @@ def setup_auth_ovh(release, cluster):
     """
     print(f"Setup the OVH authentication for namespace {release}")
 
-    ovh_kubeconfig = os.path.join(ABSOLUTE_HERE, "secrets", "ovh-kubeconfig.yml")
+    ovh_kubeconfig = os.path.join(ABSOLUTE_HERE, "secrets", f"{release}-kubeconfig.yml")
     os.environ["KUBECONFIG"] = ovh_kubeconfig
     print(f"Current KUBECONFIG='{ovh_kubeconfig}'")
     stdout = subprocess.check_output(["kubectl", "config", "use-context", cluster])
@@ -124,7 +124,7 @@ def update_networkbans(cluster):
     # some members have special logic in ban.py,
     # in which case they must be specified on the command-line
     ban_command = [sys.executable, "secrets/ban.py"]
-    if cluster in {"turing-prod", "turing-staging", "turing", "ovh"}:
+    if cluster in {"turing-prod", "turing-staging", "turing", "ovh", "ovh2"}:
         ban_command.append(cluster)
 
     subprocess.check_call(ban_command)
@@ -245,13 +245,43 @@ def setup_certmanager():
     subprocess.check_call(helm_upgrade)
 
 
+def patch_coredns():
+    """Patch coredns resource allocation
+
+    OVH2 coredns does not have sufficient memory by default after our ban patches
+    """
+    print(BOLD + GREEN + "Patching coredns resources" + NC, flush=True)
+    subprocess.check_call(
+        [
+            "kubectl",
+            "set",
+            "resources",
+            "-n",
+            "kube-system",
+            "deployments/coredns",
+            "--limits",
+            "memory=250Mi",
+            "--requests",
+            "memory=200Mi",
+        ]
+    )
+
+
 def main():
     # parse command line args
     argparser = argparse.ArgumentParser()
     argparser.add_argument(
         "release",
         help="Release to deploy",
-        choices=["staging", "prod", "ovh", "turing-prod", "turing-staging", "turing"],
+        choices=[
+            "staging",
+            "prod",
+            "ovh",
+            "ovh2",
+            "turing-prod",
+            "turing-staging",
+            "turing",
+        ],
     )
     argparser.add_argument(
         "--name",
@@ -302,8 +332,9 @@ def main():
 
     # script is running on CI, proceed with auth and helm setup
 
-    if cluster == "ovh":
+    if cluster.startswith("ovh"):
         setup_auth_ovh(args.release, cluster)
+        patch_coredns()
     elif cluster in AZURE_RGs:
         setup_auth_turing(cluster)
     elif cluster in GCP_PROJECTS:
diff --git a/secrets/ban.py b/secrets/ban.py
index ea50f46d1..3a0b2bf66 100755
Binary files a/secrets/ban.py and b/secrets/ban.py differ
diff --git a/secrets/config/ovh2.yaml b/secrets/config/ovh2.yaml
new file mode 100644
index 000000000..72f7809c4
Binary files /dev/null and b/secrets/config/ovh2.yaml differ
diff --git a/secrets/ovh2-kubeconfig.yml b/secrets/ovh2-kubeconfig.yml
new file mode 100644
index 000000000..432e38bd9
Binary files /dev/null and b/secrets/ovh2-kubeconfig.yml differ
diff --git a/terraform/README.md b/terraform/README.md
index 292318a36..2af45e681 100644
--- a/terraform/README.md
+++ b/terraform/README.md
@@ -1,6 +1,6 @@
 # Terraform deployment info
 
-Common configuration is in terraform/modules/mybinder
+Common configuration for GKE is in terraform/modules/mybinder
 most deployed things are in mybinder/resource.tf
 variables (mostly things that should differ in staging/prod) in mybinder/variables.tf
 
@@ -49,7 +49,7 @@ terraform output -json private_keys | jq '.["events-archiver"]' | pbcopy
 
 with key names: "events-archiver", "matomo", and "binderhub-builder" and paste them into the appropriate fields in `secrets/config/$deployment.yaml`.
 
-### Notes
+## Notes
 
 - requesting previously-allocated static ip via loadBalancerIP did not work.
   Had to manually mark LB IP as static via cloud console.
@@ -57,3 +57,19 @@ with key names: "events-archiver", "matomo", and "binderhub-builder" and paste t
 - sql admin API needed to be manually enabled [here](https://console.developers.google.com/apis/library/sqladmin.googleapis.com)
 - matomo sql data was manually imported/exported via sql dashboard and gsutil in cloud console
 - events archive history was manually migrated via `gsutil -m rsync` in cloud console
+
+## OVH
+
+The new OVH cluster is also deployed via terraform in the `ovh` directory.
+This has a lot less to deploy than the flagship GKE deployment,
+but deploys a Harbor (container image) registry as well.
+
+### OVH Notes
+
+- credentials are in `terraform/secrets/ovh-creds.sh`
+- the token in the credentials is owned by Min because OVH tokens are always owned by real OVH users, not a per-project 'service account'.
+  The token only has permissions on the MyBinder cloud project, however.
+- the only manual creation step was the s3 bucket and user for terraform state; the rest is created with terraform
+- the harbor registry on OVH is old, which forces us to use an older
+  harbor _provider_.
+  Once OVH upgrades harbor to at least 2.2 (2.4 is expected in 2022-12), we should be able to upgrade the harbor provider and robot accounts.
diff --git a/terraform/ovh/main.tf b/terraform/ovh/main.tf
new file mode 100644
index 000000000..4c637205d
--- /dev/null
+++ b/terraform/ovh/main.tf
@@ -0,0 +1,290 @@
+terraform {
+  required_providers {
+    ovh = {
+      source  = "ovh/ovh"
+      version = "~> 0.22.0"
+    }
+    random = {
+      source  = "hashicorp/random"
+      version = "~> 3.3.2"
+    }
+    harbor = {
+      source = "BESTSELLER/harbor"
+      # can't use 3.0, which requires harbor 2.2 for robot accounts
+      # OVH deploys 2.0.1
+      version = "~> 2.0.11"
+    }
+  }
+  # store remote state in OVH's s3 (other clusters use gcs)
+  backend "s3" {
+    bucket                      = "tf-state-ovh"
+    key                         = "terraform.tfstate"
+    region                      = "gra"
+    endpoint                    = "s3.gra.io.cloud.ovh.net"
+    skip_credentials_validation = true
+    skip_region_validation      = true
+  }
+}
+
+provider "ovh" {
+  endpoint = "ovh-eu"
+  # credentials loaded via source ../secrets/ovh-creds.sh
+}
+
+locals {
+  service_name = "b309c78177f1458187add722e8db8dc2"
+  cluster_name = "ovh2"
+  # GRA9 is colocated with registry
+  region = "GRA9"
+}
+
+# create a private network for our cluster
+resource "ovh_cloud_project_network_private" "network" {
+  service_name = local.service_name
+  name         = local.cluster_name
+  regions      = [local.region]
+}
+
+resource "ovh_cloud_project_network_private_subnet" "subnet" {
+  service_name = local.service_name
+  network_id   = ovh_cloud_project_network_private.network.id
+
+  region  = local.region
+  start   = "10.0.0.100"
+  end     = "10.0.0.254"
+  network = "10.0.0.0/24"
+  dhcp    = true
+}
+
+resource "ovh_cloud_project_kube" "cluster" {
+  service_name = local.service_name
+  name         = local.cluster_name
+  region       = local.region
+  version      = "1.24"
+  # make sure we wait for the subnet to exist
+  depends_on = [ovh_cloud_project_network_private_subnet.subnet]
+
+  # private_network_id is an openstackid for some reason?
+  private_network_id = tolist(ovh_cloud_project_network_private.network.regions_attributes)[0].openstackid
+
+  customization {
+    apiserver {
+      admissionplugins {
+        enabled = ["NodeRestriction"]
+        # disable AlwaysPullImages, which causes problems
+        disabled = ["AlwaysPullImages"]
+      }
+    }
+  }
+  update_policy = "MINIMAL_DOWNTIME"
+}
+
+# ovh node flavors: https://www.ovhcloud.com/en/public-cloud/prices/
+
+resource "ovh_cloud_project_kube_nodepool" "core" {
+  service_name = local.service_name
+  kube_id      = ovh_cloud_project_kube.cluster.id
+  name         = "core-202211"
+  # b2-15 is 4 core, 15GB
+  flavor_name   = "b2-15"
+  desired_nodes = 1
+  max_nodes     = 3
+  min_nodes     = 1
+  autoscale     = true
+  template {
+    metadata {
+      labels = {
+        "mybinder.org/pool-type" = "core"
+      }
+    }
+  }
+}
+
+resource "ovh_cloud_project_kube_nodepool" "user" {
+  service_name = local.service_name
+  kube_id      = ovh_cloud_project_kube.cluster.id
+  name         = "user-202211"
+  # r2-60 is 4 core, 60GB
+  flavor_name   = "r2-60"
+  desired_nodes = 1
+  max_nodes     = 6
+  min_nodes     = 1
+  autoscale     = true
+  template {
+    metadata {
+      labels = {
+        "mybinder.org/pool-type" = "users"
+      }
+    }
+  }
+}
+
+# outputs
+
+output "kubeconfig" {
+  value       = ovh_cloud_project_kube.cluster.kubeconfig
+  sensitive   = true
+  description = <<EOF
+    # save kubeconfig with:
+    export KUBECONFIG=$PWD/../../secrets/ovh2-kubeconfig.yml
+    terraform output -raw kubeconfig > $KUBECONFIG
+    chmod 600 $KUBECONFIG
+    kubectl config rename-context kubernetes-admin@ovh2 ovh2
+    kubectl config use-context ovh2
+    EOF
+}
+
+# registry
+
+data "ovh_cloud_project_capabilities_containerregistry_filter" "registry_plan" {
+  service_name = local.service_name
+  # SMALL is 200GB (too small)
+  # MEDIUM is 600GB
+  # LARGE is 5TiB
+  plan_name = "MEDIUM"
+  region    = "GRA"
+}
+
+resource "ovh_cloud_project_containerregistry" "registry" {
+  service_name = local.service_name
+  plan_id      = data.ovh_cloud_project_capabilities_containerregistry_filter.registry_plan.id
+  region       = data.ovh_cloud_project_capabilities_containerregistry_filter.registry_plan.region
+  name         = "mybinder-ovh"
+}
+
+# admin user (needed for harbor provider)
+resource "ovh_cloud_project_containerregistry_user" "admin" {
+  service_name = ovh_cloud_project_containerregistry.registry.service_name
+  registry_id  = ovh_cloud_project_containerregistry.registry.id
+  email        = "mybinder-admin@mybinder.org"
+  login        = "mybinder-admin"
+}
+
+
+# now configure the registry via harbor itself
+provider "harbor" {
+  url      = ovh_cloud_project_containerregistry.registry.url
+  username = ovh_cloud_project_containerregistry_user.admin.login
+  password = ovh_cloud_project_containerregistry_user.admin.password
+}
+
+# chart images go in mybinder-chart
+resource "harbor_project" "mybinder-chart" {
+  name = "mybinder-chart"
+  # chart images need to be public
+  # because we can't have two pull secrets for one registry,
+  # and harbor < 2.2 can't grant read-only access to more than one project
+  # on the same registry
+  public = true
+}
+
+# user builds go in mybinder-builds
+# these are separate for easier separation of retention policies
+resource "harbor_project" "mybinder-builds" {
+  name = "mybinder-builds"
+}
+
+
+# TODO: robot accounts change with harbor 2.2 / harbor-provider 3.0
+# in particular, we can drop the two separate pullers
+resource "harbor_robot_account" "chartpress" {
+  name        = "chartpress"
+  description = "mybinder chartpress: access to push new chart images"
+  project_id  = harbor_project.mybinder-chart.id
+  actions     = ["push", "pull"]
+}
+
+resource "harbor_robot_account" "chart-puller" {
+  name        = "chart-puller"
+  description = "pull mybinder chart images"
+  project_id  = harbor_project.mybinder-chart.id
+  actions     = ["pull"]
+}
+
+resource "harbor_robot_account" "builder" {
+  name        = "builder"
+  description = "BinderHub builder: push new user images"
+  project_id  = harbor_project.mybinder-builds.id
+  actions     = ["push", "pull"]
+}
+
+resource "harbor_robot_account" "user-puller" {
+  name        = "user-puller"
+  description = "Pull access to user images"
+  project_id  = harbor_project.mybinder-builds.id
+  actions     = ["pull"]
+}
+
+# retention policies created by hand
+# OVH harbor is too old for terraform provider (2.0.1, need 2.2)
+# resource "harbor_retention_policy" "user" {
+#   scope    = harbor_project.mybinder-builds.id
+#   schedule = "weekly"
+#   rule {
+#     most_recently_pulled = 1
+#   }
+#   rule {
+#     n_days_since_last_pull = 30
+#   }
+#   rule {
+#     n_days_since_last_push = 7
+#   }
+# }
+#
+# resource "harbor_retention_policy" "chart" {
+#   scope    = harbor_project.mybinder-chart.id
+#   schedule = "weekly"
+#   # keep the most recent 5 versions
+#   # (by both push and pull, which should usually be the same)
+#   rule {
+#     most_recently_pulled = 5
+#   }
+#   rule {
+#     most_recently_pushed = 5
+#   }
+#   rule {
+#     n_days_since_last_push = 7
+#   }
+# }
+
+resource "harbor_garbage_collection" "gc" {
+  schedule        = "weekly"
+  delete_untagged = true
+}
+
+# registry outputs
+
+output "registry_url" {
+  value = ovh_cloud_project_containerregistry.registry.url
+}
+
+output "registry_admin_login" {
+  value     = ovh_cloud_project_containerregistry_user.admin.login
+  sensitive = true
+}
+
+output "registry_admin_password" {
+  value     = ovh_cloud_project_containerregistry_user.admin.password
+  sensitive = true
+}
+
+output "registry_chartpress_token" {
+  value     = harbor_robot_account.chartpress.token
+  sensitive = true
+}
+
+output "registry_chart_puller_token" {
+  value     = harbor_robot_account.chart-puller.token
+  sensitive = true
+}
+
+output "registry_builder_token" {
+  value     = harbor_robot_account.builder.token
+  sensitive = true
+}
+
+output "registry_user_puller_token" {
+  value     = harbor_robot_account.user-puller.token
+  sensitive = true
+}
diff --git a/terraform/secrets/ovh-creds.sh b/terraform/secrets/ovh-creds.sh
new file mode 100644
index 000000000..122e8c162
Binary files /dev/null and b/terraform/secrets/ovh-creds.sh differ
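
For reference, a minimal sketch of how the new terraform outputs feed the secrets used elsewhere in this diff: the kubeconfig path matches what deploy.py's setup_auth_ovh() expects for the ovh2 release, and the chartpress robot token is the value referenced by the cd.yml comment for DOCKER_PASSWORD_OVH2; the exact GitHub secret wiring and encrypted-file keys are assumptions, not shown in this diff.

# run from terraform/ovh, after `source ../secrets/ovh-creds.sh` and `terraform apply`

# save the cluster kubeconfig where deploy.py looks for it (secrets/ovh2-kubeconfig.yml)
export KUBECONFIG=$PWD/../../secrets/ovh2-kubeconfig.yml
terraform output -raw kubeconfig > "$KUBECONFIG"
chmod 600 "$KUBECONFIG"
kubectl config rename-context kubernetes-admin@ovh2 ovh2

# robot-account token for pushing chart images; this is the value behind
# DOCKER_PASSWORD_OVH2 in cd.yml and the registry credential in secrets/config/ovh2.yaml
terraform output -raw registry_chartpress_token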