Skip to content

Commit 3bb2d2c

Browse files
authored
Add Calico to Cilium migration guide (#461)
1 parent a7ec25a commit 3bb2d2c

File tree

14 files changed

+864
-0
lines changed

14 files changed

+864
-0
lines changed
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env bash
2+
3+
set -eu
4+
5+
here="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
6+
# shellcheck source=migration/calico-to-cilium/common.sh
7+
source "${here}/common.sh"
8+
9+
check_bin kubectl
10+
check_bin cilium-cli
11+
check_bin kubectl-evict
12+
13+
NODE="${1:-}"
14+
if [[ -z "${NODE}" ]]; then
15+
log_error "FATAL: need a node name" >&2
16+
exit 1
17+
fi
18+
NODE_HASH="$(hash_node "${NODE}")"
19+
20+
CALICO_TO_CILIUM=true
21+
EVICT_PODS_WITH_IP_PREFIX="${CALICO_IP_PREFIX}"
22+
if [[ "${2:-}" == "--rollback" ]]; then
23+
CALICO_TO_CILIUM=false
24+
EVICT_PODS_WITH_IP_PREFIX="${CILIUM_IP_PREFIX}"
25+
fi
26+
27+
# Announce which node is being migrated (typo "Starging" fixed in the message).
log_info "Starting migration for node $(yellow_text "${NODE}") with hash $(yellow_text "${NODE_HASH}")"
28+
29+
deferred_cleanup() {
30+
# Remove the 'skip-taint' label
31+
unlabel_node "${NODE}" "skip-taint"
32+
33+
# Remove the temporary taints from other nodes
34+
log_info "Removing temporary taint from all nodes"
35+
untaint_nodes "cilium-guard-${NODE_HASH}" "$(get_all_nodes)"
36+
}
37+
trap deferred_cleanup EXIT
38+
39+
# Mark the node to allow cilium per-node configuration + skip from tainting
40+
label_node "${NODE}" "cilium-default"
41+
label_node "${NODE}" "skip-taint"
42+
43+
if ${CALICO_TO_CILIUM}; then
44+
# Cycle the cilium pod of the node to trigger CNI re-configuration
45+
log_info "Cycling cilium pod on $(yellow_text "${NODE}")"
46+
kubectl -n kube-system delete pod --field-selector spec.nodeName="${NODE}" -l k8s-app=cilium
47+
kubectl -n kube-system rollout status daemonset/cilium --watch
48+
49+
# Check the node connectivity
50+
if ! check_node_connectivity "${NODE}" "${NODE_HASH}"; then
51+
log_error "FATAL: could not confirm network connectivity for node ${NODE}"
52+
exit 1
53+
fi
54+
fi
55+
56+
# Add a temporary taint to all OTHER nodes (prevents scheduling elsewhere)
57+
log_info "Tainting all nodes not labeled with $(yellow_text "${LABEL_PREFIX}/skip-taint")"
58+
taint_nodes "cilium-guard-${NODE_HASH}" "$(get_unlabeled_nodes "skip-taint")"
59+
60+
# Evict node pods managed by Calico one by one, with retries, thus respecting PDBs
61+
# -> since we tainted every other node, they should be scheduling on the node we're processing
62+
get_node_pods_with_ip_prefix "${NODE}" "${EVICT_PODS_WITH_IP_PREFIX}" | xargs "${here}/evict_queue.py"
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env bash
2+
3+
set -eu
4+
5+
here="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
6+
# shellcheck source=migration/calico-to-cilium/common.sh
7+
source "${here}/common.sh"
8+
9+
CONFIG_DIR="${CK8S_CONFIG_PATH}/${TARGET_CLUSTER}-config/group_vars/k8s_cluster"
10+
11+
patch_yaml_config() {
12+
local -r patch_file="${1}"
13+
local -r config_file="${2}"
14+
15+
log_info "Patching $(yellow_text "${CONFIG_DIR}/${config_file}") with $(yellow_text "${here}/${patch_file}")"
16+
17+
# shellcheck disable=SC2016
18+
yq eval-all '. as $item ireduce ({}; . * $item)' "${here}/${patch_file}" "${CONFIG_DIR}/${config_file}" >"${CONFIG_DIR}/${config_file}.new"
19+
mv -f "${CONFIG_DIR}/${config_file}.new" "${CONFIG_DIR}/${config_file}"
20+
}
21+
22+
enable_monitoring() {
23+
local -r config="${CONFIG_DIR}/ck8s-cilium.yaml"
24+
25+
log_info "Enabling service monitors for Cilium"
26+
27+
yq -i '.ck8s_cilium.operator.monitoring.installServiceMonitor = true' "${config}"
28+
yq -i '.ck8s_cilium.hubble.monitoring.installServiceMonitor = true' "${config}"
29+
yq -i '.ck8s_cilium.prometheus.installServiceMonitor = true' "${config}"
30+
}
31+
32+
patch_yaml_config k8s-cluster-config/enable-cilium.yaml ck8s-k8s-cluster.yaml
33+
if kubectl get crd servicemonitors.monitoring.coreos.com >/dev/null; then
34+
enable_monitoring
35+
fi
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/usr/bin/env bash
2+
3+
kubectl delete -n kube-system daemonset calico-node --wait --ignore-not-found
4+
kubectl delete -n kube-system daemonset calico-accountant --wait --ignore-not-found
5+
kubectl delete -n kube-system deployment calico-kube-controllers --wait --ignore-not-found
6+
7+
mapfile -t CALICO_CRDS < <(kubectl api-resources --api-group=crd.projectcalico.org -o name)
8+
if [[ "${#CALICO_CRDS[@]}" -gt 0 ]]; then
9+
kubectl delete crds "${CALICO_CRDS[@]}"
10+
fi
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
# Migrating Kubespray clusters from Calico to Cilium CNI
2+
3+
> [!NOTE]
4+
> Clusters will be migrated one at a time depending on the environment variable `TARGET_CLUSTER`.
5+
>
6+
> It does not matter which cluster is migrated first, but it is recommended to start the migration with the _service_ cluster.
7+
>
8+
> It's also worth mentioning that full network connectivity is maintained between the Calico and Cilium subnets during the migration.
9+
>
10+
> For reference, it takes about 5 minutes to complete the disruptive parts of this guide, on a cluster with 5 nodes.
11+
12+
> [!IMPORTANT]
13+
> This guide assumes all commands are run from the `migration/calico-to-cilium` directory of the `compliantkubernetes-kubespray` repository.
14+
15+
> [!IMPORTANT]
16+
> The Cilium pod subnet is preconfigured to `10.235.64.0/18`, in accordance with the recommended value from the official documentation.
17+
>
18+
> This should prevent any overlaps with the Calico subnet, assumed to have the `10.233.0.0/16` prefix.
19+
20+
## Prerequisites
21+
22+
The migration uses the Cilium CLI for status checks, as well as the `evict` plugin for `kubectl`.
23+
24+
You will need to install the following on your system:
25+
26+
### Golang
27+
28+
On Ubuntu: `sudo apt install golang-go`
29+
30+
### The Cilium CLI
31+
32+
Grab the binary from the [GitHub releases page](https://github.com/cilium/cilium-cli/releases) and put it somewhere in your `PATH`.
33+
34+
To have it installed under `${HOME}/.local/bin`:
35+
36+
```shell
37+
mkdir -p "${HOME}/.local/bin"
38+
curl -fsSL -o- https://github.com/cilium/cilium-cli/releases/download/v0.18.7/cilium-linux-amd64.tar.gz | tar -zxv -C "${HOME}/.local/bin"
39+
mv "${HOME}/.local/bin/cilium" "${HOME}/.local/bin/cilium-cli"
40+
```
41+
42+
> [!NOTE]
43+
> This assumes that the `${HOME}/.local/bin` directory is within your `PATH`. If that's not the case:
44+
> `export PATH="$PATH:$HOME/.local/bin"`
45+
46+
> [!IMPORTANT]
47+
> The migration scripts assume the executable name for the Cilium CLI is `cilium-cli` and _NOT_ `cilium`.
48+
49+
### The `evict` plugin for `kubectl`
50+
51+
```shell
52+
go install github.com/ueokande/kubectl-evict@latest
53+
```
54+
55+
## Prepare
56+
57+
These steps can be performed without any disruption to the target cluster.
58+
59+
- Prepare environment variables:
60+
61+
```bash
62+
export TARGET_CLUSTER="<sc|wc>"
63+
export CK8S_CONFIG_PATH="/path/to/cluster/config"
64+
export KUBECONFIG="${CK8S_CONFIG_PATH}/.state/kube_config_${TARGET_CLUSTER}.yaml"
65+
```
66+
67+
- This guide includes a complete Kubespray run for the target cluster. For OpenStack clusters, credentials must be sourced:
68+
69+
```bash
70+
test -f ${CK8S_CONFIG_PATH}/openrc.sh && source ${CK8S_CONFIG_PATH}/openrc.sh
71+
test -f ${CK8S_CONFIG_PATH}/secret/openstack-app-credentials-for-kubespray.sh && source <(sops -d ${CK8S_CONFIG_PATH}/secret/openstack-app-credentials-for-kubespray.sh)
72+
```
73+
74+
- Ensure that the checked out tag or commit in your Kubespray repository matches the version in the cluster:
75+
76+
```bash
77+
KUBESPRAY_REF="$(yq '.ck8sKubesprayVersion' ${CK8S_CONFIG_PATH}/${TARGET_CLUSTER}-config/group_vars/all/ck8s-kubespray-general.yaml)"
78+
git switch --detach "${KUBESPRAY_REF}"
79+
80+
# update the kubespray submodule if needed
81+
git submodule sync
82+
git submodule update --init --recursive
83+
```
84+
85+
- Switch `kube_owner` to `root` and apply the changes:
86+
87+
```bash
88+
yq -i '.kube_owner = "root"' "${CK8S_CONFIG_PATH}/${TARGET_CLUSTER}-config/group_vars/k8s_cluster/ck8s-k8s-cluster.yaml"
89+
../../bin/ck8s-kubespray apply $TARGET_CLUSTER -b -e=ignore_assert_errors=true --skip-tags=multus
90+
```
91+
92+
- Install Cilium using the values provided in the `cilium-chart-values` directory and wait for the `DaemonSet` rollout:
93+
94+
```bash
95+
cilium-cli install --version 1.17.5 -f cilium-chart-values/cilium-values.yaml -f cilium-chart-values/cilium-extra.yaml
96+
kubectl -n kube-system rollout status daemonset/cilium --watch
97+
```
98+
99+
- Enable the [Per-node configuration](https://docs.cilium.io/en/v1.17/configuration/per-node-config/) feature:
100+
101+
```bash
102+
kubectl apply -f cilium-node-config/during-migration.yaml
103+
```
104+
105+
## Execute
106+
107+
These steps will cause disruption in the target cluster.
108+
109+
### 1. Temporarily allow all traffic through Calico
110+
111+
```bash
112+
kubectl apply -f policies/calico-allow-all.yaml
113+
```
114+
115+
### 2. Migrate worker nodes
116+
117+
Get the list of worker nodes and migrate them one by one, passing the node name as argument to the `./20-migrate-node.sh` script.
118+
119+
For example:
120+
121+
```bash
122+
kubectl get nodes --no-headers -o custom-columns=":metadata.name" |
123+
grep -v 'control-plane' |
124+
xargs -rt -I{} --interactive ./20-migrate-node.sh {}
125+
```
126+
127+
> [!TIP]
128+
> To skip confirmation prompts for each node, remove the `--interactive` flag from `xargs`.
129+
130+
### 3. Migrate control plane nodes
131+
132+
Get the list of control plane nodes and migrate them one by one, passing the node name as argument to the `./20-migrate-node.sh` script.
133+
134+
For example:
135+
136+
```bash
137+
kubectl get nodes --no-headers -o custom-columns=":metadata.name" |
138+
grep 'control-plane' |
139+
xargs -rt -I{} --interactive ./20-migrate-node.sh {}
140+
```
141+
142+
### 4. Switch the Kubespray configuration to Cilium
143+
144+
```bash
145+
./80-switch-to-cilium.sh
146+
../../bin/ck8s-kubespray apply $TARGET_CLUSTER -b -e=ignore_assert_errors=true --tags="download,network"
147+
```
148+
149+
### 5. Cleanup
150+
151+
- Remove the per-node Cilium configuration:
152+
153+
```bash
154+
kubectl -n kube-system delete ciliumnodeconfigs.cilium.io cilium-default
155+
```
156+
157+
- Remove Calico remnants:
158+
159+
```bash
160+
./90-cleanup-calico.sh
161+
```
162+
163+
### 6. (Optional) Reconfigure Apps
164+
165+
If Welkin Apps has been deployed in the environment, it will require a reconfiguration step:
166+
167+
```bash
168+
export CK8S_APPS_REPOSITORY_PATH=/path/to/welkin-apps
169+
170+
yq -i '.networkPlugin.type = "cilium"' "${CK8S_CONFIG_PATH}/common-config.yaml"
171+
yq -i '.networkPlugin.calico.calicoAccountant.enabled = false' "${CK8S_CONFIG_PATH}/common-config.yaml"
172+
yq -i '.networkPlugin.calico.calicoFelixMetrics.enabled = false' "${CK8S_CONFIG_PATH}/common-config.yaml"
173+
174+
${CK8S_APPS_REPOSITORY_PATH}/bin/update-ips.bash both dry-run
175+
${CK8S_APPS_REPOSITORY_PATH}/bin/update-ips.bash both apply
176+
177+
${CK8S_APPS_REPOSITORY_PATH}/bin/ck8s apply sc --concurrency=$(nproc)
178+
${CK8S_APPS_REPOSITORY_PATH}/bin/ck8s apply wc --concurrency=$(nproc)
179+
```
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
annotateK8sNode: true
2+
encryption:
3+
enabled: true
4+
strictMode:
5+
enabled: false
6+
type: wireguard
7+
envoy:
8+
enabled: false
9+
hubble:
10+
enabled: true
11+
metrics:
12+
enabled:
13+
- drop:labelsContext=traffic_direction,source_pod,source_namespace,source_ip,destination_pod,destination_namespace,destination_ip
14+
- flow:labelsContext=traffic_direction,source_pod,source_namespace,source_ip,destination_pod,destination_namespace,destination_ip
15+
- dns
16+
- tcp
17+
- icmp
18+
- httpV2
19+
serviceMonitor:
20+
enabled: true
21+
operator:
22+
prometheus:
23+
enabled: true
24+
serviceMonitor:
25+
enabled: true
26+
unmanagedPodWatcher:
27+
restart: false
28+
policyAuditMode: false
29+
policyCIDRMatchMode: nodes
30+
policyEnforcementMode: never
31+
prometheus:
32+
enabled: true
33+
serviceMonitor:
34+
enabled: true
35+
trustCRDsExist: true

0 commit comments

Comments
 (0)