diff --git a/hack/boskos.py b/hack/boskos.py new file mode 100644 index 000000000000..28a2814160a6 --- /dev/null +++ b/hack/boskos.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 + +# Copyright 2021 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os + +import requests +import time + +BOSKOS_HOST = os.environ.get("BOSKOS_HOST", "boskos") +BOSKOS_RESOURCE_NAME = os.environ.get('BOSKOS_RESOURCE_NAME') + + +def checkout_account_request(resource_type, user, input_state): + url = f'http://{BOSKOS_HOST}/acquire?type={resource_type}&state={input_state}&dest=busy&owner={user}' + r = requests.post(url) + status = r.status_code + reason = r.reason + result = "" + + if status == 200: + content = r.content.decode() + result = json.loads(content) + + return status, reason, result + + +def checkout_account(resource_type, user): + status, reason, result = checkout_account_request(resource_type, user, "clean") + # TODO(sbueringer): find out if we still need this + # replicated the implementation of cluster-api-provider-gcp + # we're working around an issue with the data in boskos. + # We'll remove the code that tries both free and clean once all the data is good. + # Afterwards we should just check for free + if status == 404: + status, reason, result = checkout_account_request(resource_type, user, "free") + + if status != 200: + raise Exception(f"Got invalid response {status}: {reason}") + + print(f"export BOSKOS_RESOURCE_NAME={result['name']}") + print(f"export GCP_PROJECT={result['name']}") + + +def release_account(user): + url = f'http://{BOSKOS_HOST}/release?name={BOSKOS_RESOURCE_NAME}&dest=dirty&owner={user}' + + r = requests.post(url) + + if r.status_code != 200: + raise Exception(f"Got invalid response {r.status_code}: {r.reason}") + + +def send_heartbeat(user): + url = f'http://{BOSKOS_HOST}/update?name={BOSKOS_RESOURCE_NAME}&state=busy&owner={user}' + + while True: + print(f"POST-ing heartbeat for resource {BOSKOS_RESOURCE_NAME} to {BOSKOS_HOST}") + r = requests.post(url) + + if r.status_code == 200: + print(f"response status: {r.status_code}") + else: + print(f"Got invalid response {r.status_code}: {r.reason}") + + time.sleep(60) + + +def main(): + parser = argparse.ArgumentParser(description='Boskos GCP Account Management') + + parser.add_argument( + '--get', dest='checkout_account', action="store_true", + help='Checkout a Boskos GCP Account' + ) + + parser.add_argument( + '--release', dest='release_account', action="store_true", + help='Release a Boskos GCP Account' + ) + + parser.add_argument( + '--heartbeat', dest='send_heartbeat', action="store_true", + help='Send heartbeat for the checked out a Boskos GCP Account' + ) + + parser.add_argument( + '--resource-type', dest="resource_type", type=str, + default="gce-project", + help="Type of Boskos resource to manage" + ) + + parser.add_argument( + '--user', dest="user", type=str, + default="cluster-api", + help="username" + ) + + args = parser.parse_args() + + if args.checkout_account: + 
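+        # checkout_account prints "export BOSKOS_RESOURCE_NAME=..." and "export GCP_PROJECT=..."
+        # so the caller can redirect stdout to a file and source it (see scripts/ci-e2e-scale.sh).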
checkout_account(args.resource_type, args.user) + + elif args.release_account: + release_account(args.user) + + elif args.send_heartbeat: + send_heartbeat(args.user) + + +if __name__ == "__main__": + main() diff --git a/hack/remote/README.md b/hack/remote/README.md new file mode 100644 index 000000000000..367a417fda6d --- /dev/null +++ b/hack/remote/README.md @@ -0,0 +1,162 @@ + +# TODOs: + +* Test on MacOS + +* go over all files in the diff, finalize + FIXME /TODOs + +* Get it to work on Prow + * Test & fixup GCP script + +Backlog: +* Optimize scripting / automation + * Implement in Go? +* MacOS: Debug why it crashes the local Docker Desktop + * try without IPv6: docker network create -d=bridge -o com.docker.network.bridge.enable_ip_masquerade=true -o com.docker.network.driver.mtu=1500 --subnet=172.24.4.0/24 --gateway=172.24.4.1 kind + * => re-add IPv6 + +# Setting up a Docker engine on AWS: + +Prerequisites: +* AWS CLI must be installed & configured with credentials + +Setup server on AWS with Docker engine: +```bash +./hack/remote/setup-docker-on-aws-account.sh +``` + +Note: The script can also be run repeatedly, e.g. to create the ssh tunnel when the server already exists. + +# Use remote Docker engine + +## Docker CLI + +```bash +export DOCKER_HOST=tcp://10.0.3.15:2375 +docker version +docker info +``` + +## Local management cluster + +### e2e tests via IDE + +Prerequisites: +```bash +make generate-e2e-templates +make docker-build-e2e +``` + +Run configuration: +* Add to environment: `CAPD_DOCKER_HOST=tcp://10.0.3.15:2375` + +### Tilt + +tilt-settings.yaml: +```yaml +kustomize_substitutions: + # Use remote Docker host in CAPD. + CAPD_DOCKER_HOST: "tcp://10.0.3.15:2375" +``` + +```bash +tilt up +``` + +### Quickstart + +```bash +export CAPD_DOCKER_HOST="tcp://10.0.3.15:2375" +``` + +## Remote management cluster + +Create remote kind cluster: +```bash +# SSH to server +ssh-add ~/.ssh/aws-capi-docker +ssh cloud@${SERVER_PUBLIC_IP} +sudo su + +# Note: this has to be run on the server. +# Running it locally will fails because 10.0.3.15 is not a valid IP there. +kind create cluster --name=capi-test --config=${HOME}/kind.yaml +``` + +### e2e tests via IDE + +Prerequisites: +```bash +make generate-e2e-templates + +# If local images are required (e.g. because code has changed) +export DOCKER_HOST=tcp://10.0.3.15:2375 +make docker-build-e2e +kind load docker-image --name=capi-test gcr.io/k8s-staging-cluster-api/cluster-api-controller-amd64:dev +kind load docker-image --name=capi-test gcr.io/k8s-staging-cluster-api/kubeadm-bootstrap-controller-amd64:dev +kind load docker-image --name=capi-test gcr.io/k8s-staging-cluster-api/kubeadm-control-plane-controller-amd64:dev +kind load docker-image --name=capi-test gcr.io/k8s-staging-cluster-api/capd-manager-amd64:dev +kind load docker-image --name=capi-test gcr.io/k8s-staging-cluster-api/test-extension-amd64:dev +``` + +Run configuration: +* Add to environment: `DOCKER_HOST=tcp://10.0.3.15:2375;CAPD_DOCKER_HOST=tcp://10.0.3.15:2375` +* Add to program arguments: `-e2e.use-existing-cluster=true` + +### Tilt + +tilt-settings.yaml: +```yaml +kustomize_substitutions: + # Use remote Docker host in CAPD. + CAPD_DOCKER_HOST: "tcp://10.0.3.15:2375" +``` + +```bash +export DOCKER_HOST=tcp://10.0.3.15:2375 +tilt up +``` + +FIXME(sbueringer): enable local registry +* let's check if it is faster (as redeploy also just copies the binary over) +* copy&paste kind-install-for-capd.sh script over(?) 
(already done => just test it) +* ensure registry is reachable from local machine + +## Getting access to workload clusters + +Retrieve kubeconfig for workload clusters via: +```bash +clusterctl get kubeconfig capi-quickstart > /tmp/kubeconfig +kubectl --kubeconfig /tmp/kubeconfig get no,po -A -A +``` +Note: The kubeconfigs returned by `kind get kubeconfig` don't work. + +# Troubleshooting + +Verify connectivity: + +```bash +# SSH to server +ssh-add ~/.ssh/aws-capi-docker +ssh cloud@${SERVER_PUBLIC_IP} + +# On the server: +nc -l 10.0.3.15 8005 + +# Locally: +nc 10.0.3.15 8005 +``` + +# Tested scenarios + +* Local mgmt cluster: + * Tilt: + * works well + * e2e tests (via Intellij): + * works well +* Remote mgmt cluster: + * Tilt: + * loading images via kind load is slow + * e2e tests (via Intellij): + * building e2e images with make is quick + * loading images with kind load is slow diff --git a/hack/remote/cloud-init.yaml.tpl b/hack/remote/cloud-init.yaml.tpl new file mode 100644 index 000000000000..46a4990fa2af --- /dev/null +++ b/hack/remote/cloud-init.yaml.tpl @@ -0,0 +1,169 @@ +#cloud-config +runcmd: +- /root/setup.sh +final_message: "The system is finally up, after $UPTIME seconds" +users: +- name: cloud + lock_passwd: true + sudo: ALL=(ALL) NOPASSWD:ALL + ssh_authorized_keys: + - ${SSH_PUBLIC_KEY} +# Infrastructure packages required: +# python3 - required by sshuttle +# jq - for convenience +packages: +- python3 +- jq +write_files: +- path: /etc/systemd/system/docker.service.d/override.conf + permissions: 0644 + content: | + # Disable flags to dockerd, all settings are done in /etc/docker/daemon.json + [Service] + ExecStart= + ExecStart=/usr/bin/dockerd +- path: /etc/docker/daemon.json + permissions: 0755 + # Note: We had to disable command line flags in the Docker systemd unit with the override file, + # because otherwise the hosts flag would have been set twice and Docker fails to start up. + # Because we entirely disable flags in the default Docker systemd unit we also have to set + # "containerd" in daemon.json + content: | + { + "hosts": ["tcp://${SERVER_PRIVATE_IP}:2375", "unix:///var/run/docker.sock"], + "tls": false + } + # FIXME:(sbueringer) just a test +- path: /usr/local/bin/kind-install-for-capd.sh + permissions: 0755 + content: | + #!/usr/bin/env bash + + # Copyright 2021 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + + # This script installs a local kind cluster with a local container registry and the correct files mounted for using CAPD + # to test Cluster API. + # This script is a customized version of the kind_with_local_registry script supplied by the kind maintainers at + # https://kind.sigs.k8s.io/docs/user/local-registry/ + # The modifications mount the docker socket inside the kind cluster so that CAPD can be used to + # created docker containers. 
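+    # Note: 10.0.3.15 below is the default SERVER_PRIVATE_IP used by the setup scripts; the
+    # registry and the kind API server bind to it so they are reachable through the sshuttle tunnel.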
+ + set -o errexit + set -o nounset + set -o pipefail + + if [[ "${TRACE-0}" == "1" ]]; then + set -o xtrace + fi + + KIND_CLUSTER_NAME=${CAPI_KIND_CLUSTER_NAME:-"capi-test"} + + if [[ "$(kind get clusters)" =~ .*"${KIND_CLUSTER_NAME}".* ]]; then + echo "kind cluster already exists, moving on" + exit 0 + fi + + # create registry container unless it already exists + reg_name='kind-registry' + reg_port='5000' + running="$(docker inspect -f '{{.State.Running}}' "${reg_name}" 2>/dev/null || true)" + if [ "${running}" != 'true' ]; then + docker run \ + -d --restart=always -p "10.0.3.15:${reg_port}:5000" --name "${reg_name}" \ + registry:2 + fi + + # create a cluster with the local registry enabled in containerd + cat < /root/kind.yaml + kind: Cluster + apiVersion: kind.x-k8s.io/v1alpha4 + networking: + apiServerAddress: "10.0.3.15" + nodes: + - role: control-plane + extraMounts: + - hostPath: /var/run/docker.sock + containerPath: /var/run/docker.sock + EOF diff --git a/hack/remote/setup-docker-lib.sh b/hack/remote/setup-docker-lib.sh new file mode 100755 index 000000000000..132faa342e88 --- /dev/null +++ b/hack/remote/setup-docker-lib.sh @@ -0,0 +1,209 @@ +#!/bin/bash + +# Copyright 2023 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +REPO_ROOT=$(git rev-parse --show-toplevel) + +export ARTIFACTS="${ARTIFACTS:-${REPO_ROOT}/_artifacts}" + +# Setup output directory for the docker server. +ARTIFACTS_DOCKER_SERVER="${ARTIFACTS}/docker-server" +mkdir -p "${ARTIFACTS_DOCKER_SERVER}" +echo "This folder contains files for the docker server." > "${ARTIFACTS_DOCKER_SERVER}/README.md" + +SSHUTTLE_PIDFILE="${ARTIFACTS_DOCKER_SERVER}/sshuttle.pid" + +# retry retries a command $1 times with $2 sleep in between +# Example: retry 10 30 echo test +function retry { + local attempt=0 + local max_attempts=${1} + local interval=${2} + shift; shift + until [[ "$attempt" -ge "$max_attempts" ]] ; do + attempt=$((attempt+1)) + set +e + eval "$*" && return || echo "failed $attempt times: $*" + set -e + sleep "$interval" + done + echo "error: reached max attempts at retry($*)" + return 1 +} + +# get_ssh_cmd calculates the ssh cmd command based on +# SSH_PRIVATE_KEY_FILE and SSH_PUBLIC_KEY_FILE. +# Example: get_ssh_cmd $private_key_file $public_key_file +function get_ssh_cmd { + local private_key_file=$1 && shift + local public_key_file=$1 && shift + + local key_file=${private_key_file} + if [ -z "$key_file" ]; then + # If there's no private key file use the public key instead + # This allows us to specify a private key which is held only on a + # hardware device and therefore has no key file + key_file=${public_key_file} + fi + + # Note: LogLevel=ERROR hides warnings. 
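+  # Without it, the StrictHostKeyChecking=no / UserKnownHostsFile=/dev/null combination
+  # prints a "Permanently added ... to the list of known hosts." warning on every connection.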
+ echo "ssh -i ${key_file} -l cloud " \ + "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o IdentitiesOnly=yes -o PasswordAuthentication=no -o LogLevel=ERROR " +} + +# wait_for_ssh waits until ssh is available on IP $1 +# Example: wait_for_ssh $ip $ssh_cmd +function wait_for_ssh_available { + echo -e "# Wait until ssh is available \n" + + local ip=$1 && shift + local ssh_cmd=$1 && shift + + retry 10 30 "${ssh_cmd} ${ip} -- true" + echo "" +} + +# start_sshuttle starts sshuttle +# Note: If necessary it also install sshuttle. +# Example: start_sshuttle $public_ip $private_network_cir $server_kind_subnet $ssh_cmd +function start_sshuttle { + echo -e "# Start sshuttle \n" + + local public_ip=$1 && shift + local private_network_cir=$1 && shift + local server_kind_subnet=$1 && shift + local ssh_cmd=$1 && shift + + # Install sshuttle if it isn't already installed. + if ! command -v sshuttle > /dev/null; + then + echo -e "Install sshuttle\n" + pip3 install sshuttle + fi + + # Kill sshuttle if it is already running + # Note: This depends on ${SSHUTTLE_PIDFILE}. + stop_sshuttle + + # Wait until ssh is available. + wait_for_ssh_available "${public_ip}" "${ssh_cmd}" + + # Open tunnel. + echo "Opening tunnel for CIDRs: ${private_network_cir} and ${server_kind_subnet} (via ${public_ip})" + # sshuttle won't succeed until ssh is up and python is installed on the destination + retry 30 20 sshuttle -r "${public_ip}" \ + "${private_network_cir}" \ + "${server_kind_subnet}" \ + --ssh-cmd=\""${ssh_cmd}"\" \ + -l 0.0.0.0 -D \ + --pidfile "${SSHUTTLE_PIDFILE}" + + # Give sshuttle a few seconds to be fully up + sleep 5 + echo "" +} + +# stop_sshuttle kills sshuttle +# Note: This depends on ${SSHUTTLE_PIDFILE}. +# Example: stop_sshuttle +function stop_sshuttle { + echo -e "# Stop sshuttle (if running)\n" + + if [ -f "${SSHUTTLE_PIDFILE}" ]; then + local sshuttle_pid + sshuttle_pid=$(cat "${SSHUTTLE_PIDFILE}") + echo "Stopping sshuttle with PID ${sshuttle_pid}" + kill "${sshuttle_pid}" + while [ -d "/proc/${sshuttle_pid}" ]; do + echo "Waiting for sshuttle to stop" + sleep 1 + done + else + echo "PID file ${SSHUTTLE_PIDFILE} does not exist, skip stopping sshuttle" + fi + + echo "" +} + +# wait_for_cloud_init waits until cloud init is completed and retrieve logs. +# Example: wait_for_cloud_init $ip $ssh_cmd +function wait_for_cloud_init { + echo -e "# Wait for cloud init \n" + + local ip=$1 && shift + local ssh_cmd=$1 && shift + + # Wait until cloud-final is either failed or active. + $ssh_cmd "$ip" -- " + echo 'Waiting for cloud-final to complete\n' + start=\$(date -u +%s) + while true; do + systemctl --quiet is-failed cloud-final && exit 1 + systemctl --quiet is-active cloud-final && exit 0 + echo Waited \$(((\$(date -u +%s)-\$start)/60)) minutes + echo "" + sleep 30 + done" + + # Flush the journal to ensure we get the final logs of cloud-final if it died. 
+ $ssh_cmd "$ip" -- sudo journalctl --flush + + # Capture logs of cloud-init services + for service in cloud-config cloud-final cloud-init-local cloud-init; do + echo -e "[${service}] Get logs and check status" + $ssh_cmd "$ip" -- sudo journalctl -a -b -u "$service" > "${ARTIFACTS_DOCKER_SERVER}/${service}.log" + + # Fail early if any cloud-init service failed + $ssh_cmd "$ip" -- sudo systemctl status --full "$service" > "${ARTIFACTS_DOCKER_SERVER}/${service}-status.txt" || \ + { + echo -e "[${service}] failed" + echo -e "\nStatus:" + cat "${ARTIFACTS_DOCKER_SERVER}/${service}.log" + echo -e "\nLogs:" + cat "${ARTIFACTS_DOCKER_SERVER}/${service}-status.txt" + exit 1 + } + done + + echo "" +} + +# template_cloud_init_file templates the cloud init file and +# prints the file location. +# Example: template_cloud_init_file $server_private_ip $public_key_file +function template_cloud_init_file { + local server_private_ip=$1 && shift + local public_key_file=$1 && shift + + cloud_init_file="${ARTIFACTS_DOCKER_SERVER}/cloud-init.yaml" + + # Ensure cloud init file exists and is empty. + echo "" > "$cloud_init_file" + + # Render cloud init file. + # shellcheck disable=SC2016,SC2086,SC2153 + SERVER_PRIVATE_IP="${server_private_ip}" \ + SSH_PUBLIC_KEY="$(cat ${public_key_file})" \ + SERVER_KIND_SUBNET="${SERVER_KIND_SUBNET}" \ + SERVER_KIND_GATEWAY="${SERVER_KIND_GATEWAY}" \ + envsubst '${SERVER_PRIVATE_IP} ${SSH_PUBLIC_KEY} ${SERVER_KIND_SUBNET} ${SERVER_KIND_GATEWAY}' \ + < "./hack/remote/cloud-init.yaml.tpl" >> "$cloud_init_file" + + echo "${cloud_init_file}" +} diff --git a/hack/remote/setup-docker-on-aws-account.sh b/hack/remote/setup-docker-on-aws-account.sh new file mode 100755 index 000000000000..086c709fac0a --- /dev/null +++ b/hack/remote/setup-docker-on-aws-account.sh @@ -0,0 +1,260 @@ +#!/bin/bash + +# Copyright 2023 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "${REPO_ROOT}" || exit 1 + +# shellcheck source=./hack/remote/setup-docker-lib.sh +source "${REPO_ROOT}/hack/remote/setup-docker-lib.sh" + +####################################################### +echo -e "# Server Configuration\n" +####################################################### +# SERVER_NAME is the name of the server. +SERVER_NAME=${SERVER_NAME:-"capi-docker"} +# SERVER_PRIVATE_IP is the private IP of the server. +# Note: Must be inside of AWS_NETWORK_CIDR. +# Note: We will be able to access this IP directly because we will open a +# tunnel to AWS_NETWORK_CIDR with sshuttle. +SERVER_PRIVATE_IP=${SERVER_PRIVATE_IP:-"10.0.3.15"} +echo -e " SERVER_NAME: ${SERVER_NAME}\n SERVER_PRIVATE_IP: ${SERVER_PRIVATE_IP}" +# SERVER_KIND_SUBNET is the subnet of the kind network on the server. +# Note: We will be able to access this network directly because we will open a +# tunnel to SERVER_KIND_SUBNET with sshuttle. This includes all container running +# in this network. 
+SERVER_KIND_SUBNET=${SERVER_KIND_SUBNET:-"172.24.0.0/16"} +# SERVER_KIND_GATEWAY is the gateway of the kind network on the server. +SERVER_KIND_GATEWAY=${SERVER_KIND_GATEWAY:-"172.24.0.1"} +echo -e " SERVER_KIND_SUBNET: ${SERVER_KIND_SUBNET}\n SERVER_KIND_GATEWAY: ${SERVER_KIND_GATEWAY}" +echo "" +####################################################### + +####################################################### +echo -e "# AWS Configuration\n" +####################################################### +# AWS_REGION is the AWS region. +# FIXME(sbueringer): cleanup ap-southeast region, zone, .. +AWS_REGION=${AWS_REGION:-"eu-central-1"} +#AWS_REGION=${AWS_REGION:-"ap-southeast-1"} +# AWS_ZONE is the AWS zone. +AWS_ZONE=${AWS_ZONE:-"eu-central-1a"} +#AWS_ZONE=${AWS_ZONE:-"ap-southeast-1a"} +# AWS_NETWORK_NAME is the name of the VPC and all the network +# objects we create in the VPC. +AWS_NETWORK_NAME=${AWS_NETWORK_NAME:-"${SERVER_NAME}"} +# AWS_NETWORK_CIDR is the CIDR of the AWS network. +# Note: The server will be part of this network. +# Note: We will be able to access this network directly because we will open a +# tunnel to AWS_NETWORK_CIDR with sshuttle. +AWS_NETWORK_CIDR=${AWS_NETWORK_CIDR:-"10.0.3.0/24"} +echo -e " AWS_REGION: ${AWS_REGION}\n AWS_ZONE: ${AWS_ZONE}\n AWS_NETWORK_NAME: ${AWS_NETWORK_NAME}\n AWS_NETWORK_CIDR: ${AWS_NETWORK_CIDR}" +# AWS_MACHINE_TYPE is the machine type for the server. +# Choose via: https://eu-central-1.console.aws.amazon.com/ec2/v2/home?region=eu-central-1#InstanceTypes +# For example: +# * c5.4xlarge 16 vCPU 32 GB RAM => ~ 0.776 USD per hour +# * c5.12xlarge 48 vCPU 96 GB RAM => ~ 2.328 USD per hour +AWS_MACHINE_TYPE=${AWS_MACHINE_TYPE:-"c5.4xlarge"} +# AWS_AMI is the AMI we will use for the server. +# AMIs: +# * Canonical, Ubuntu, 22.04 LTS, amd64 jammy image build on 2023-02-08 id: +# * eu-central-1: ami-0d1ddd83282187d18 +# * ap-southeast-1: ami-082b1f4237bd816a1 +# FIXME(sbueringer) +AWS_AMI=${AWS_AMI:-"ami-0d1ddd83282187d18"} +#AWS_AMI=${AWS_AMI:-"ami-082b1f4237bd816a1"} +echo -e " AWS_MACHINE_TYPE: ${AWS_MACHINE_TYPE}\n AWS_AMI: ${AWS_AMI}" +# AWS_KEY_PAIR is the key pair we use to access the server. +# Prepare key pair with: +# # Create key pair: +# aws ec2 create-key-pair --key-name capi-docker --query 'KeyMaterial' --region "${AWS_REGION}" --output text > ${AWS_KEY_PAIR_PRIVATE_KEY_FILE} +# chmod 0400 ${AWS_KEY_PAIR_PRIVATE_KEY_FILE} +# # Add to key to local ssh agent and generate public key: +# ssh-add ${AWS_KEY_PAIR_PRIVATE_KEY_FILE} +# ssh-keygen -y -f ${AWS_KEY_PAIR_PRIVATE_KEY_FILE} > ${AWS_KEY_PAIR_PUBLIC_KEY_FILE} +AWS_KEY_PAIR=${AWS_KEY_PAIR:-"capi-docker"} +# AWS_KEY_PAIR_PUBLIC_KEY_FILE is the public key file. +# Note: This key file will be added to authorized keys of the cloud user on the server. +AWS_KEY_PAIR_PUBLIC_KEY_FILE=${AWS_KEY_PAIR_PUBLIC_KEY_FILE:-"${HOME}/.ssh/aws-capi-docker.pub"} +# AWS_KEY_PAIR_PUBLIC_KEY_FILE is the private key file. +# Note: This key file will be used in the ssh cmd to access the server. +AWS_KEY_PAIR_PRIVATE_KEY_FILE=${AWS_KEY_PAIR_PRIVATE_KEY_FILE:-"${HOME}/.ssh/aws-capi-docker"} +# AWS_SSH_CMD is the ssh cmd we use to access the server. +AWS_SSH_CMD=$(get_ssh_cmd "${AWS_KEY_PAIR_PRIVATE_KEY_FILE}" "${AWS_KEY_PAIR_PUBLIC_KEY_FILE}") +echo -e " AWS_KEY_PAIR: ${AWS_KEY_PAIR}\n AWS_KEY_PAIR_PUBLIC_KEY_FILE: ${AWS_KEY_PAIR_PUBLIC_KEY_FILE}\n AWS_KEY_PAIR_PRIVATE_KEY_FILE: ${AWS_KEY_PAIR_PRIVATE_KEY_FILE}" +echo "" +# Disable pagination of AWS CLI. 
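+# Note: AWS CLI v2 would otherwise pipe command output through a pager and block the script waiting for input.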
+export AWS_PAGER="" +####################################################### + +# init_infrastructure creates the basic infrastructure: +# * VPC, Subnet, Security Group, Internet gateway and Routes +# Note: This also allows ingress traffic on port 22 for ssh access (via the security group). +# Example: create_infrastructure +function create_infrastructure() { + echo -e "# Create Infrastructure \n" + + if [[ ${AWS_NETWORK_NAME} != "default" ]]; then + if [[ $(aws ec2 describe-vpcs --filters Name=tag:Name,Values="${AWS_NETWORK_NAME}" --region="${AWS_REGION}" --query 'length(*[0])') = "0" ]]; + then + # Create VPC. + echo "Create VPC with name ${AWS_NETWORK_NAME}" + aws ec2 create-vpc --cidr-block "${AWS_NETWORK_CIDR}" --tag-specifications "ResourceType=vpc,Tags=[{Key=Name,Value=${AWS_NETWORK_NAME}}]" --region="${AWS_REGION}" + # Get VPC ID. + local aws_vpc_id + aws_vpc_id=$(aws ec2 describe-vpcs --filters Name=tag:Name,Values="${AWS_NETWORK_NAME}" --region "${AWS_REGION}" --query '*[0].VpcId' --output text) + + # Create subnet. + echo "Create subnet with name ${AWS_NETWORK_NAME}" + aws ec2 create-subnet --cidr-block "${AWS_NETWORK_CIDR}" --vpc-id "${aws_vpc_id}" --tag-specifications "ResourceType=subnet,Tags=[{Key=Name,Value=${AWS_NETWORK_NAME}}]" --region "${AWS_REGION}" --availability-zone "${AWS_ZONE}" + # Get route table ID. + local aws_route_table_id + aws_route_table_id=$(aws ec2 describe-route-tables --filters "Name=vpc-id,Values=${aws_vpc_id}" --region "${AWS_REGION}" --query '*[0].RouteTableId' --output text) + + # Create security group. + echo "Create security group with name ${AWS_NETWORK_NAME}" + aws ec2 create-security-group --group-name "${AWS_NETWORK_NAME}" --description "${AWS_NETWORK_NAME}" --vpc-id "${aws_vpc_id}" --tag-specifications "ResourceType=security-group,Tags=[{Key=Name,Value=${AWS_NETWORK_NAME}}]" --region="${AWS_REGION}" + # Get security group ID. + local aws_security_group_id + aws_security_group_id=$(aws ec2 describe-security-groups --filters Name=tag:Name,Values="${AWS_NETWORK_NAME}" --region "${AWS_REGION}" --query '*[0].GroupId' --output text) + # Allow port 22 for ssh. + echo "Allow ingress on port 22 on security group with name ${AWS_NETWORK_NAME}" + aws ec2 authorize-security-group-ingress --group-id "${aws_security_group_id}" --protocol tcp --port 22 --cidr 0.0.0.0/0 --region="${AWS_REGION}" + + # Create internet gateway. + # Documentation to enable internet access for subnet: + # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/TroubleshootingInstancesConnecting.html#TroubleshootingInstancesConnectionTimeout + echo "Create internet gateway with name ${AWS_NETWORK_NAME}" + aws ec2 create-internet-gateway --tag-specifications "ResourceType=internet-gateway,Tags=[{Key=Name,Value=${AWS_NETWORK_NAME}}]" --region="${AWS_REGION}" + # Get internet gateway ID. + local aws_internet_gateway_id + aws_internet_gateway_id=$(aws ec2 describe-internet-gateways --filters Name=tag:Name,Values="${AWS_NETWORK_NAME}" --region "${AWS_REGION}" --query '*[0].InternetGatewayId' --output text) + # Attach internet gateway to VPC. + echo "Attach internet gateway with name ${AWS_NETWORK_NAME} to VPC" + aws ec2 attach-internet-gateway --internet-gateway-id "${aws_internet_gateway_id}" --vpc-id "${aws_vpc_id}" --region="${AWS_REGION}" + # Create routes for internet egress traffic. 
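+      # Note: the routes are added to the VPC's main route table (looked up above);
+      # the subnet created earlier uses it implicitly because no explicit association is made.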
+ echo "Create routes for IPv4 and IPv6 egress internet traffic" + aws ec2 create-route --route-table-id "${aws_route_table_id}" --destination-cidr-block 0.0.0.0/0 --gateway-id "${aws_internet_gateway_id}" --region "${AWS_REGION}" + aws ec2 create-route --route-table-id "${aws_route_table_id}" --destination-ipv6-cidr-block ::/0 --gateway-id "${aws_internet_gateway_id}" --region "${AWS_REGION}" + else + echo "There is already a VPC with name ${AWS_NETWORK_NAME}. Skipping creation of VPC and corresponding objects." + fi + else + echo "Nothing to do for default VPC." + fi + + echo "" +} + +# create_server creates a server with a Docker engine. +# Example: create_server $server_name $server_private_ip +function create_server { + echo -e "# Create Server \n" + + local server_name=$1 && shift + local server_private_ip=$1 && shift + + # Template user data for cloud-init. + local userdata_file + userdata_file=$(template_cloud_init_file "${server_private_ip}" "${AWS_KEY_PAIR_PUBLIC_KEY_FILE}") + + # Create the server if there is no running server with the same name. + if [[ $(aws ec2 describe-instances --filters Name=tag:Name,Values="${server_name}" --filters Name=instance-state-name,Values=running --region="${AWS_REGION}" --query 'length(*[0])') = "0" ]]; + then + local aws_subnet_id + aws_subnet_id=$(aws ec2 describe-subnets --filters Name=tag:Name,Values="${AWS_NETWORK_NAME}" --region "${AWS_REGION}" --query '*[0].SubnetId' --output text) + local aws_security_group_id + aws_security_group_id=$(aws ec2 describe-security-groups --filters Name=tag:Name,Values="${AWS_NETWORK_NAME}" --region "${AWS_REGION}" --query '*[0].GroupId' --output text) + + echo "Create server with name ${server_name}" + # Note: /dev/sda1 is renamed to /dev/nvme0n1 by AWS + aws ec2 run-instances --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=${server_name}}]" \ + --region "${AWS_REGION}" \ + --placement "AvailabilityZone=${AWS_ZONE}" \ + --image-id "${AWS_AMI}" \ + --instance-type "${AWS_MACHINE_TYPE}" \ + --block-device-mappings 'DeviceName=/dev/sda1,Ebs={VolumeSize=100}' \ + --subnet-id "${aws_subnet_id}" \ + --private-ip-address "${server_private_ip}" \ + --count 1 \ + --associate-public-ip-address \ + --security-group-ids "${aws_security_group_id}" \ + --key-name "${AWS_KEY_PAIR}" \ + --user-data "file://${userdata_file}" \ + --no-paginate + + echo "Wait until server has a public IP." + # shellcheck disable=SC2046 + retry 3 10 [ ! -z $(aws ec2 describe-instances \ + --filters "Name=tag:Name,Values=${server_name}" \ + --region "${AWS_REGION}" \ + --query 'Reservations[*].Instances[*].PublicIpAddress' \ + --output text) ] + else + echo "There is already a running server with name ${server_name}. Skipping server creation." + fi + + echo "" +} + +# cleanup stops sshuttle and exits. +# Example: cleanup +function cleanup { + stop_sshuttle + exit 0 +} + +function main() { + if [ "${1:-}" == "cleanup" ]; then + cleanup + fi + + if [[ -n "${SKIP_INIT_INFRA:-}" ]]; then + echo "Skipping infrastructure initialization." + else + create_infrastructure + fi + + # Create server with a Docker engine. + create_server "${SERVER_NAME}" "${SERVER_PRIVATE_IP}" + server_public_ip=$(aws ec2 describe-instances \ + --filters "Name=tag:Name,Values=${SERVER_NAME}" \ + --region "${AWS_REGION}" \ + --query 'Reservations[*].Instances[*].PublicIpAddress' \ + --output text) + + echo -e "# Server running: public ip: ${server_public_ip}, private ip: ${SERVER_PRIVATE_IP}\n" + + # Open the tunnel. 
+ start_sshuttle "${server_public_ip}" "${AWS_NETWORK_CIDR}" "${SERVER_KIND_SUBNET}" "${AWS_SSH_CMD}" + + # Wait for cloud-init to complete. + # Note: As we already opened the tunnel we can access the server with its private ip. + wait_for_cloud_init "${SERVER_PRIVATE_IP}" "${AWS_SSH_CMD}" + + # Wait until the docker engine is available. + echo -e "Wait until Docker is available\n" + export DOCKER_HOST=tcp://10.0.3.15:2375 + retry 5 30 "docker version" + echo "" + + echo "Docker now available. Set DOCKER_HOST=${DOCKER_HOST} to use it with the Docker CLI." +} + +main "$@" diff --git a/hack/remote/setup-docker-on-gce-project.sh b/hack/remote/setup-docker-on-gce-project.sh new file mode 100755 index 000000000000..d464c928d1be --- /dev/null +++ b/hack/remote/setup-docker-on-gce-project.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# Copyright 2023 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +# shellcheck source=./hack/remote/setup-docker-lib.sh +source "${REPO_ROOT}/hack/remote/setup-docker-lib.sh" + +# Set key files +# Note: These will be later used to connect to the Docker host. +SSH_PUBLIC_KEY_FILE=${SSH_PUBLIC_KEY_FILE:-"/root/.ssh/google_compute_engine.pub"} +SSH_PRIVATE_KEY_FILE=${SSH_PRIVATE_KEY_FILE:-"/root/.ssh/google_compute_engine"} + + +function cloud_init { + GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS:-""} + GCP_PROJECT=${GCP_PROJECT:-""} + GCP_REGION=${GCP_REGION:-"us-east4"} + GCP_MACHINE_MIN_CPU_PLATFORM=${GCP_MACHINE_MIN_CPU_PLATFORM:-"Intel Cascade Lake"} + GCP_NETWORK_NAME=${GCP_NETWORK_NAME:-"${CLUSTER_NAME}-mynetwork"} + + # We have a quota of 24 vCPUs + GCP_MACHINE_TYPE=${GCP_MACHINE_TYPE:-"n2-standard-16"} + + echo "Using: GCP_PROJECT: ${GCP_PROJECT} GCP_REGION: ${GCP_REGION} GCP_NETWORK_NAME: ${GCP_NETWORK_NAME}" + + # Generate local ssh configuration + # NOTE(mdbooth): This command successfully populates ssh config and then + # fails for some reason I don't understand. We ignore the failure. + gcloud compute config-ssh || true +} + +function init_infrastructure() { + if [[ ${GCP_NETWORK_NAME} != "default" ]]; then + if ! 
gcloud compute networks describe "$GCP_NETWORK_NAME" --project "$GCP_PROJECT" >/dev/null; then + gcloud compute networks create --project "$GCP_PROJECT" "$GCP_NETWORK_NAME" --subnet-mode custom + gcloud compute networks subnets create "$GCP_NETWORK_NAME" --project "$GCP_PROJECT" \ + --network="$GCP_NETWORK_NAME" --range="$PRIVATE_NETWORK_CIDR" --region "$GCP_REGION" + + gcloud compute firewall-rules create "${GCP_NETWORK_NAME}-allow-http" --project "$GCP_PROJECT" \ + --allow tcp:80 --direction=INGRESS --network "$GCP_NETWORK_NAME" --quiet + # As of Victoria, neutron is the only service which isn't multiplexed by + # apached on port 80 + gcloud compute firewall-rules create "${GCP_NETWORK_NAME}-allow-neutron" --project "$GCP_PROJECT" \ + --allow tcp:9696 --direction=INGRESS --network "$GCP_NETWORK_NAME" --quiet + gcloud compute firewall-rules create "${GCP_NETWORK_NAME}-allow-icmp" --project "$GCP_PROJECT" \ + --allow icmp --direction=INGRESS --network "$GCP_NETWORK_NAME" --priority 65534 --quiet + gcloud compute firewall-rules create "${GCP_NETWORK_NAME}-allow-ssh" --project "$GCP_PROJECT" \ + --allow "tcp:22" --direction=INGRESS --network "$GCP_NETWORK_NAME" --priority 65534 --quiet + gcloud compute firewall-rules create "${GCP_NETWORK_NAME}-allow-internal" --project "$GCP_PROJECT" \ + --allow "tcp:0-65535,udp:0-65535,icmp" --source-ranges="$PRIVATE_NETWORK_CIDR" \ + --direction=INGRESS --network "$GCP_NETWORK_NAME" --priority 65534 --quiet + fi + fi + + gcloud compute firewall-rules list --project "$GCP_PROJECT" + gcloud compute networks list --project="$GCP_PROJECT" + gcloud compute networks describe "$GCP_NETWORK_NAME" --project="$GCP_PROJECT" + + if ! gcloud compute routers describe "${CLUSTER_NAME}-myrouter" --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null; then + gcloud compute routers create "${CLUSTER_NAME}-myrouter" --project="$GCP_PROJECT" \ + --region="$GCP_REGION" --network="$GCP_NETWORK_NAME" + fi + if ! gcloud compute routers nats describe --router="$CLUSTER_NAME-myrouter" "$CLUSTER_NAME-mynat" \ + --project="$GCP_PROJECT" --region="${GCP_REGION}" >/dev/null; then + gcloud compute routers nats create "${CLUSTER_NAME}-mynat" --project="$GCP_PROJECT" \ + --router-region="$GCP_REGION" --router="${CLUSTER_NAME}-myrouter" \ + --nat-all-subnet-ip-ranges --auto-allocate-nat-external-ips + fi +} + +function create_vm { + local ip=$1 && shift + local userdata=$1 && shift + + local machine_type="GCP_MACHINE_TYPE" + machine_type=${!machine_type} + local servername="${CLUSTER_NAME}" + local diskname="${CLUSTER_NAME}-disk" + local imagename="${servername}-image" + + # Loop over all zones in the GCP region to ignore a full zone. + # We are not able to use 'gcloud compute zones list' as the gcloud.compute.zones.list permission is missing. + for GCP_ZONE in "${GCP_REGION}-a" "${GCP_REGION}-b" "${GCP_REGION}-c"; do + # Check if image was already created. + # Images are not zone specific, but the disk is. + if ! gcloud compute images describe "$imagename" --project "$GCP_PROJECT" >/dev/null; then + # Create the base disk image based on the public Ubuntu 20.04 LTS cloud image + # Note that this has also been verified to work with CentOS 8 as of + # 2021-01-12, but this is not tested regularly. + # To use CentOS 8: + # --image-project centos-cloud --image-family centos-stream-8 + if ! 
gcloud compute disks describe "$diskname" --project "$GCP_PROJECT" --zone "$GCP_ZONE" >/dev/null; then + gcloud compute disks create "$diskname" \ + --project "$GCP_PROJECT" \ + --image-project ubuntu-os-cloud --image-family ubuntu-2004-lts \ + --zone "$GCP_ZONE" + fi + gcloud compute images create "$imagename" \ + --project "$GCP_PROJECT" \ + --source-disk "$diskname" --source-disk-zone "$GCP_ZONE" \ + --licenses "https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" + fi + + if ! gcloud compute instances describe "$servername" --project "$GCP_PROJECT" --zone "$GCP_ZONE" >/dev/null; then + if gcloud compute instances create "$servername" \ + --project "$GCP_PROJECT" \ + --zone "$GCP_ZONE" \ + --image "$imagename" \ + --boot-disk-size 200G \ + --boot-disk-type pd-ssd \ + --can-ip-forward \ + --tags http-server,https-server,novnc,openstack-apis \ + --min-cpu-platform "$GCP_MACHINE_MIN_CPU_PLATFORM" \ + --machine-type "$machine_type" \ + --network-interface="private-network-ip=${ip},network=${CLUSTER_NAME}-mynetwork,subnet=${CLUSTER_NAME}-mynetwork" \ + --metadata-from-file user-data="$userdata"; then + # return function create_vm if the instance have been created successfully. + return + fi + fi + done + echo "No free GCP zone could be found to create instance $servername." + exit 1 +} + +function get_public_ip { + local ip + while ! ip=$(gcloud compute instances describe "${CLUSTER_NAME}" \ + --project "$GCP_PROJECT" --zone "$GCP_ZONE" \ + --format='get(networkInterfaces[0].accessConfigs[0].natIP)'); do + echo "Waiting for a public IP" + sleep 5 + done + echo "$ip" +} diff --git a/scripts/ci-e2e-scale.sh b/scripts/ci-e2e-scale.sh new file mode 100755 index 000000000000..4468d7c73c3a --- /dev/null +++ b/scripts/ci-e2e-scale.sh @@ -0,0 +1,173 @@ +#!/bin/bash + +# Copyright 2023 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "${REPO_ROOT}" || exit 1 + +# shellcheck source=./scripts/ci-e2e-lib.sh +source "${REPO_ROOT}/scripts/ci-e2e-lib.sh" + +# shellcheck source=./hack/ensure-go.sh +source "${REPO_ROOT}/hack/ensure-go.sh" +# shellcheck source=./hack/ensure-kubectl.sh +source "${REPO_ROOT}/hack/ensure-kubectl.sh" +# shellcheck source=./hack/ensure-kind.sh +source "${REPO_ROOT}/hack/ensure-kind.sh" + +# Make sure the tools binaries are on the path. +export PATH="${REPO_ROOT}/hack/tools/bin:${PATH}" + +# Build envsubst. This is later required to template the cloud-init file. +make envsubst + +# Builds CAPI (and CAPD) images. +capi:buildDockerImages + +# Prepare kindest/node images for all the required Kubernetes version; this implies +# 1. Kubernetes version labels (e.g. latest) to the corresponding version numbers. +# 2. Pre-pulling the corresponding kindest/node image if available; if not, building the image locally. 
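+# (e.g. a version label like "latest" is resolved to a concrete "v1.x.y" release before the corresponding kindest/node image is pulled or built)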
+# Following variables are currently checked (if defined): +# - KUBERNETES_VERSION +# - KUBERNETES_VERSION_UPGRADE_TO +# - KUBERNETES_VERSION_UPGRADE_FROM +k8s::prepareKindestImages +# FIXME(sbueringer): This is only useful if we import them in the remote docker engine +# TBD if we want to use any local docker host or run everything remote + +# pre-pull all the images that will be used in the e2e, thus making the actual test run +# less sensible to the network speed. This includes: +# - cert-manager images +kind:prepullAdditionalImages + +# RESOURCE_TYPE defines in which cloud we run the e2e test. +# FIXME(sbueringer): as of today boskos.py only supports "gce-project". +# "aws-account" could be supported by extending boskos.py (cf. with boskos.py in the CAPA repo). +export RESOURCE_TYPE="${RESOURCE_TYPE:-"gce-project"}" + +# Configure e2e tests +export GINKGO_NODES=3 +export GINKGO_NOCOLOR=true +export GINKGO_ARGS="--fail-fast" # Other ginkgo args that need to be appended to the command. +export E2E_CONF_FILE="${REPO_ROOT}/test/e2e/config/docker.yaml" +export ARTIFACTS="${ARTIFACTS:-${REPO_ROOT}/_artifacts}" +export SKIP_RESOURCE_CLEANUP=false +export USE_EXISTING_CLUSTER=false + +# Setup local output directory +ARTIFACTS_LOCAL="${ARTIFACTS}/localhost" +mkdir -p "${ARTIFACTS_LOCAL}" +echo "This folder contains logs from the local host where the tests ran." > "${ARTIFACTS_LOCAL}/README.md" + +# Configure the containerd socket, otherwise 'ctr' would not work +export CONTAINERD_ADDRESS=/var/run/docker/containerd/containerd.sock + +# ensure we retrieve additional info for debugging when we leave the script +cleanup() { + # shellcheck disable=SC2046 + kill $(pgrep -f 'docker events') || true + # shellcheck disable=SC2046 + kill $(pgrep -f 'ctr -n moby events') || true + + cp /var/log/docker.log "${ARTIFACTS_LOCAL}/docker.log" || true + docker ps -a > "${ARTIFACTS_LOCAL}/docker-ps.txt" || true + docker images > "${ARTIFACTS_LOCAL}/docker-images.txt" || true + docker info > "${ARTIFACTS_LOCAL}/docker-info.txt" || true + docker system df > "${ARTIFACTS_LOCAL}/docker-system-df.txt" || true + docker version > "${ARTIFACTS_LOCAL}/docker-version.txt" || true + + ctr namespaces list > "${ARTIFACTS_LOCAL}/containerd-namespaces.txt" || true + ctr -n moby tasks list > "${ARTIFACTS_LOCAL}/containerd-tasks.txt" || true + ctr -n moby containers list > "${ARTIFACTS_LOCAL}/containerd-containers.txt" || true + ctr -n moby images list > "${ARTIFACTS_LOCAL}/containerd-images.txt" || true + ctr -n moby version > "${ARTIFACTS_LOCAL}/containerd-version.txt" || true + + # Stop boskos heartbeat + [[ -z ${HEART_BEAT_PID:-} ]] || kill -9 "${HEART_BEAT_PID}" + + # Stop sshuttle which was used to tunnel to the remote Docker host. + pkill sshuttle + + # Verify that no containers are running at this time + # Note: This verifies that all our tests clean up clusters correctly. + if [[ ! "$(docker ps -q | wc -l)" -eq "0" ]] + then + echo "ERROR: Found unexpected running containers:" + echo "" + docker ps + exit 1 + fi +} +trap "cleanup" EXIT SIGINT + +# Stream docker and containerd events. +docker events > "${ARTIFACTS_LOCAL}/docker-events.txt" 2>&1 & +ctr -n moby events > "${ARTIFACTS_LOCAL}/containerd-events.txt" 2>&1 & + +# Ensure that python3-pip is installed. +apt update +apt install -y python3-pip +rm -rf /var/lib/apt/lists/* + +# Install/upgrade pip and requests module explicitly for HTTP calls. 
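+# Note: the requests module is imported by hack/boskos.py, which is invoked below.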
+python3 -m pip install --upgrade pip requests + +# If BOSKOS_HOST is set then acquire a resource of type ${RESOURCE_TYPE} from Boskos. +if [ -n "${BOSKOS_HOST:-}" ]; then + # Check out the account from Boskos and store the produced environment + # variables in a temporary file. + account_env_var_file="$(mktemp)" + python3 hack/boskos.py --get --resource-type="${RESOURCE_TYPE}" 1>"${account_env_var_file}" + checkout_account_status="${?}" + + # If the checkout process was a success then load the account's + # environment variables into this process. + # shellcheck disable=SC1090 + [ "${checkout_account_status}" = "0" ] && . "${account_env_var_file}" + + # Always remove the account environment variable file. It contains + # sensitive information. + rm -f "${account_env_var_file}" + + if [ ! "${checkout_account_status}" = "0" ]; then + echo "error getting account from boskos" 1>&2 + exit "${checkout_account_status}" + fi + + # run the heart beat process to tell boskos that we are still + # using the checked out account periodically + python3 -u hack/boskos.py --heartbeat >> "$ARTIFACTS/logs/boskos.log" 2>&1 & + HEART_BEAT_PID=$! +fi + +"hack/remote/setup-docker-on-${RESOURCE_TYPE}.sh" + +# Use remote Docker host in CAPD. +export CAPD_DOCKER_HOST=tcp://10.0.3.15:2375 + +make test-e2e + +test_status="${?}" + +cleanup + +# If Boskos is being used then release the resource back to Boskos. +[ -z "${BOSKOS_HOST:-}" ] || python3 hack/boskos.py --release >> "$ARTIFACTS/logs/boskos.log" 2>&1 + +exit "${test_status}" diff --git a/scripts/ci-e2e.sh b/scripts/ci-e2e.sh index 6f5f632a8758..421e7a17345a 100755 --- a/scripts/ci-e2e.sh +++ b/scripts/ci-e2e.sh @@ -15,6 +15,7 @@ # limitations under the License. set -o errexit +set -o nounset set -o pipefail @@ -110,6 +111,7 @@ cleanup() { } trap "cleanup" EXIT SIGINT +# Stream docker and containerd events. docker events > "${ARTIFACTS_LOCAL}/docker-events.txt" 2>&1 & ctr -n moby events > "${ARTIFACTS_LOCAL}/containerd-events.txt" 2>&1 &