Commit 1023be0

NO-JIRA: chore(ci): extract k8s setup and disk space cleanup into a composite action each (opendatahub-io#2538)
1 parent f9745f8 commit 1023be0

File tree

4 files changed: +215 -179 lines changed
New file (composite action): .github/actions/free-up-disk-space
Lines changed: 25 additions & 0 deletions

```yaml
---
name: 'Free up disk space'
description: 'Removes unnecessary packages and files to free up disk space on GitHub runners'
runs:
  using: "composite"
  steps:
    - name: Free up additional disk space
      shell: bash
      run: |
        set -x
        df -h
        sudo apt-get update
        sudo apt-get purge -y '^dotnet-.*' '^llvm-.*' 'php.*' '^mongodb-.*'
        sudo apt-get autoremove -y --purge
        sudo apt-get clean
        sudo rm -rf /usr/local/.ghcup &
        sudo rm -rf /usr/local/lib/android &
        sudo rm -rf /usr/local/share/boost &
        sudo rm -rf /usr/local/lib/node_modules &
        sudo rm -rf /usr/share/dotnet &
        sudo rm -rf /opt/ghc &
        sudo rm -rf /opt/hostedtoolcache/CodeQL &
        sudo docker image prune --all --force &
        wait
        df -h
```
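For orientation, a minimal sketch of how a workflow job consumes this composite action. The `uses:` path is taken from the workflow diff below; the job name, runner, and checkout step are illustrative assumptions:

```yaml
jobs:
  build:
    runs-on: ubuntu-latest  # illustrative runner
    steps:
      # the action lives in this repository, so the repo must be checked out first
      - uses: actions/checkout@v4
      - name: Free up additional disk space
        uses: ./.github/actions/free-up-disk-space
```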
New file (composite action): .github/actions/provision-k8s
Lines changed: 165 additions & 0 deletions

```yaml
---
name: 'Provision K8s Cluster'
description: 'Installs cri-o and provisions a single-node Kubernetes cluster using kubeadm'
runs:
  using: "composite"
  steps:
    - name: Install cri-o
      id: install-crio
      shell: bash
      run: |
        set -Eeuxo pipefail

        # the Microsoft repo's kubelet does not provide /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
        # [Service]
        # EnvironmentFile=-/var/lib/kubelet/kubeadm-flags.env
        # ExecStart=/usr/bin/kubelet $KUBELET_KUBEADM_ARGS
        sudo ls /etc/apt/sources.list.d/
        sudo rm /etc/apt/sources.list.d/microsoft-prod.list

        sudo apt-get update
        sudo apt-get install -y software-properties-common curl

        # https://github.com/cri-o/packaging?tab=readme-ov-file#distributions-using-deb-packages

        curl -fsSL https://pkgs.k8s.io/core:/stable:/v${KUBERNETES_VERSION}/deb/Release.key | \
          sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg

        echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v${KUBERNETES_VERSION}/deb/ /" | \
          sudo tee /etc/apt/sources.list.d/kubernetes.list

        curl -fsSL https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v${CRIO_VERSION}/deb/Release.key | \
          sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/cri-o-apt-keyring.gpg

        echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v${CRIO_VERSION}/deb/ /" | \
          sudo tee /etc/apt/sources.list.d/cri-o.list

        sudo apt-get update

        # [ERROR FileExisting-conntrack]: conntrack not found in system path
        # see man apt-patterns for the ~name=version* syntax

        # The following packages will be DOWNGRADED:
        # kubectl
        # E: Packages were downgraded and -y was used without --allow-downgrades.

        sudo apt-get install -y --allow-downgrades \
          "cri-o=${CRIO_VERSION}.*" \
          "kubelet=${KUBERNETES_VERSION}.*" "kubeadm=${KUBERNETES_VERSION}.*" "kubectl=${KUBERNETES_VERSION}.*" \
          conntrack

        # make use of /etc/cni/net.d/11-crio-ipv4-bridge.conflist so we don't
        # need a pod network and just use the default bridge
        sudo rm -rf /etc/cni/net.d/*
        # cat /etc/cni/net.d/11-crio-ipv4-bridge.conflist
        # https://github.com/containerd/containerd/blob/main/script%2Fsetup%2Finstall-cni
        # https://www.cni.dev/plugins/current/main/bridge/
        sudo cp ${{ github.action_path }}/../../../ci/cached-builds/11-crio-ipv4-bridge.conflist /etc/cni/net.d/11-crio-ipv4-bridge.conflist

        sudo cp ${{ github.action_path }}/../../../ci/cached-builds/crio.conf /etc/crio/crio.conf.d/

        sudo systemctl daemon-reload
        sudo systemctl start crio.service
      env:
        # TODO(jdanek): install also "cri-tools=${CRIO_VERSION}.*" when updating to 1.33
        CRIO_VERSION: 1.32
        # This has to be kept in sync with the packages above, otherwise
        # [ERROR KubeletVersion]: the kubelet version is higher than the control plane version.
        # This is not a supported version skew and may lead to a malfunctional cluster.
        # Kubelet version: "1.33.0" Control plane version: "1.30.12"
        KUBERNETES_VERSION: 1.33
        # Also update version in kubeadm.yaml

    - run: sudo crictl info
      shell: bash

    - name: Show crio debug data (on failure)
      if: ${{ failure() }}
      shell: bash
      run: |
        set -Eeuxo pipefail

        sudo systemctl status crio.service || true
        sudo journalctl -xeu crio.service

    # do this early, it's a good check that cri-o is not completely broken
    - name: "Show crio images information"
      shell: bash
      run: sudo crictl images

    - name: Install Kubernetes cluster
      shell: bash
      run: |
        set -Eeuxo pipefail

        sudo swapoff -a
        sudo modprobe br_netfilter
        sudo sysctl -w net.ipv4.ip_forward=1

        # Was getting strange DNS resolution errors from pods that don't seem to want to go away sometimes:
        # Resolving raw.githubusercontent.com (raw.githubusercontent.com)... failed: Name or service not known.
        # wget: unable to resolve host address ‘raw.githubusercontent.com’
        # Here's what helped:
        # https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/#known-issues
        # https://github.com/kubernetes/kubernetes/blob/e4c1f980b76fecece30c2f77885a7117192170a6/CHANGELOG/CHANGELOG-1.30.md?plain=1#L1454
        # https://github.com/canonical/microk8s/issues/68#issuecomment-404923563
        sudo ufw allow in on cni0
        sudo ufw allow out on cni0
        sudo ufw default allow routed
        sudo iptables -P FORWARD ACCEPT
        sudo iptables -t nat -A POSTROUTING -s 10.85.0.0/16 -o eth0 -j MASQUERADE

        sudo kubeadm reset -f --cri-socket=unix:///var/run/crio/crio.sock

        # https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm
        sudo kubeadm init --config=${{ github.action_path }}/../../../ci/cached-builds/kubeadm.yaml

        mkdir -p $HOME/.kube
        sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
        sudo chown $(id -u):$(id -g) $HOME/.kube/config

    - name: Show kubelet debug data (on failure)
      if: ${{ failure() }}
      shell: bash
      run: |
        set -Eeuxo pipefail

        # [kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
        sudo cat /var/lib/kubelet/kubeadm-flags.env || true
        # [kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
        sudo cat /var/lib/kubelet/config.yaml || true

        sudo systemctl cat kubelet.service || true

        sudo cat /etc/systemd/system/kubelet.service.d/10-kubeadm.conf || true

        sudo systemctl status kubelet || true
        sudo journalctl -xeu kubelet

        # Here is one example how you may list all running Kubernetes containers by using crictl:
        sudo crictl --runtime-endpoint unix:///var/run/crio/crio.sock ps -a | grep kube | grep -v pause
        # Once you have found the failing container, you can inspect its logs with:
        # crictl --runtime-endpoint unix:///var/run/crio/crio.sock logs CONTAINERID

    - name: Show nodes status and wait for readiness
      shell: bash
      run: |
        kubectl describe nodes
        kubectl wait --for=condition=Ready nodes --all --timeout=100s || (kubectl describe nodes && false)

    - name: Wait for pods to be running
      shell: bash
      run: |
        set -Eeuxo pipefail
        kubectl wait deployments --all --all-namespaces --for=condition=Available --timeout=100s
        kubectl wait pods --all --all-namespaces --for=condition=Ready --timeout=100s

    - name: "Install local-path provisioner"
      shell: bash
      run: |
        set -Eeuxo pipefail
        kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.31/deploy/local-path-storage.yaml
        kubectl wait deployments --all --namespace=local-path-storage --for=condition=Available --timeout=100s
        # https://kubernetes.io/docs/tasks/administer-cluster/change-default-storage-class/
        kubectl get storageclass
        kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
```
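A note on paths: `${{ github.action_path }}` resolves to the action's own directory under `.github/actions/`, so the `../../../ci/cached-builds/...` copies reach back to the repository root, where the previously inlined workflow steps used plain `ci/cached-builds/...`. The referenced `kubeadm.yaml` is not part of this diff; below is a hypothetical minimal sketch of such a config. The kubeadm API kinds and fields are real, but the concrete values are assumptions, not the repository's actual file:

```yaml
# hypothetical sketch, not the repository's ci/cached-builds/kubeadm.yaml
---
apiVersion: kubeadm.k8s.io/v1beta4
kind: InitConfiguration
nodeRegistration:
  criSocket: unix:///var/run/crio/crio.sock  # matches the socket passed to kubeadm reset
---
apiVersion: kubeadm.k8s.io/v1beta4
kind: ClusterConfiguration
kubernetesVersion: "1.33.0"  # must stay in sync with KUBERNETES_VERSION in the action
```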

.github/workflows/build-notebooks-TEMPLATE.yaml

Lines changed: 3 additions & 179 deletions

```diff
@@ -133,33 +133,12 @@ jobs:
       # region Free up disk space
 
       - name: Free up additional disk space
+        uses: ./.github/actions/free-up-disk-space
         # https://docs.github.com/en/actions/learn-github-actions/expressions
         # NOTE: the arm64 GitHub hosted runner does not have the /mnt-mounted scratch disk
         if: "${{ contains(inputs.target, 'rocm') || contains(inputs.target, 'cuda') ||
           contains(inputs.target, 'pytorch') || contains(inputs.target, 'tensorflow') ||
           inputs.platform == 'linux/arm64' }}"
-        run: |
-          set -x
-
-          df -h
-
-          sudo apt-get update
-          sudo apt-get purge -y '^dotnet-.*' '^llvm-.*' 'php.*' '^mongodb-.*'
-          sudo apt-get autoremove -y --purge
-          sudo apt-get clean
-          sudo rm -rf /usr/local/.ghcup &
-          sudo rm -rf /usr/local/lib/android &
-          sudo rm -rf /usr/local/share/boost &
-          sudo rm -rf /usr/local/lib/node_modules &
-          sudo rm -rf /usr/share/dotnet &
-          sudo rm -rf /opt/ghc &
-          sudo rm -rf /opt/hostedtoolcache/CodeQL &
-
-          sudo docker image prune --all --force &
-
-          wait
-
-          df -h
 
       - id: install-compsize
         run: sudo apt-get install -y btrfs-compsize
@@ -398,164 +377,9 @@ jobs:
           ln -s ../rocm-tensorflow runtimes/rocm/tensorflow
           ln -s ../rocm-pytorch runtimes/rocm/pytorch
 
-      # https://cri-o.io/
-      - name: Install cri-o
-        id: install-crio
-        if: ${{ steps.have-tests.outputs.tests == 'true' }}
-        run: |
-          set -Eeuxo pipefail
-
-          # the Microsoft repo's kubelet does not provide /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
-          # [Service]
-          # EnvironmentFile=-/var/lib/kubelet/kubeadm-flags.env
-          # ExecStart=/usr/bin/kubelet $KUBELET_KUBEADM_ARGS
-          sudo ls /etc/apt/sources.list.d/
-          sudo rm /etc/apt/sources.list.d/microsoft-prod.list
-
-          sudo apt-get update
-          sudo apt-get install -y software-properties-common curl
-
-          # https://github.com/cri-o/packaging?tab=readme-ov-file#distributions-using-deb-packages
-
-          curl -fsSL https://pkgs.k8s.io/core:/stable:/v${KUBERNETES_VERSION}/deb/Release.key | \
-            sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
-
-          echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v${KUBERNETES_VERSION}/deb/ /" | \
-            sudo tee /etc/apt/sources.list.d/kubernetes.list
-
-          curl -fsSL https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v${CRIO_VERSION}/deb/Release.key | \
-            sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/cri-o-apt-keyring.gpg
-
-          echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v${CRIO_VERSION}/deb/ /" | \
-            sudo tee /etc/apt/sources.list.d/cri-o.list
-
-          sudo apt-get update
-
-          # [ERROR FileExisting-conntrack]: conntrack not found in system path
-          # see man apt-patterns for the ~name=version* syntax
-
-          # The following packages will be DOWNGRADED:
-          # kubectl
-          # E: Packages were downgraded and -y was used without --allow-downgrades.
-
-          sudo apt-get install -y --allow-downgrades \
-            "cri-o=${CRIO_VERSION}.*" \
-            "kubelet=${KUBERNETES_VERSION}.*" "kubeadm=${KUBERNETES_VERSION}.*" "kubectl=${KUBERNETES_VERSION}.*" \
-            conntrack
-
-          # make use of /etc/cni/net.d/11-crio-ipv4-bridge.conflist so we don't
-          # need a pod network and just use the default bridge
-          sudo rm -rf /etc/cni/net.d/*
-          # cat /etc/cni/net.d/11-crio-ipv4-bridge.conflist
-          # https://github.com/containerd/containerd/blob/main/script%2Fsetup%2Finstall-cni
-          # https://www.cni.dev/plugins/current/main/bridge/
-          sudo cp ci/cached-builds/11-crio-ipv4-bridge.conflist /etc/cni/net.d/11-crio-ipv4-bridge.conflist
-
-          sudo cp ci/cached-builds/crio.conf /etc/crio/crio.conf.d/
-
-          sudo systemctl daemon-reload
-          sudo systemctl start crio.service
-        env:
-          # TODO(jdanek): install also "cri-tools=${CRIO_VERSION}.*" when updating to 1.33
-          CRIO_VERSION: 1.32
-          # This has to be kept in sync with the packages above, otherwise
-          # [ERROR KubeletVersion]: the kubelet version is higher than the control plane version.
-          # This is not a supported version skew and may lead to a malfunctional cluster.
-          # Kubelet version: "1.33.0" Control plane version: "1.30.12"
-          KUBERNETES_VERSION: 1.33
-          # Also update version in kubeadm.yaml
-
-      - run: sudo crictl info
+      - name: Provision K8s cluster
         if: ${{ steps.have-tests.outputs.tests == 'true' }}
-
-      - name: Show crio debug data (on failure)
-        if: ${{ failure() && steps.have-tests.outputs.tests == 'true' }}
-        run: |
-          set -Eeuxo pipefail
-
-          sudo systemctl status crio.service || true
-          sudo journalctl -xeu crio.service
-
-      # do this early, it's a good check that cri-o is not completely broken
-      - name: "Show crio images information"
-        if: ${{ steps.have-tests.outputs.tests == 'true' }}
-        run: sudo crictl images
-
-      - name: Install Kubernetes cluster
-        if: ${{ steps.have-tests.outputs.tests == 'true' }}
-        run: |
-          set -Eeuxo pipefail
-
-          sudo swapoff -a
-          sudo modprobe br_netfilter
-          sudo sysctl -w net.ipv4.ip_forward=1
-
-          # Was getting strange DNS resolution errors from pods that don't seem to want to go away sometimes:
-          # Resolving raw.githubusercontent.com (raw.githubusercontent.com)... failed: Name or service not known.
-          # wget: unable to resolve host address ‘raw.githubusercontent.com’
-          # Here's what helped:
-          # https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/#known-issues
-          # https://github.com/kubernetes/kubernetes/blob/e4c1f980b76fecece30c2f77885a7117192170a6/CHANGELOG/CHANGELOG-1.30.md?plain=1#L1454
-          # https://github.com/canonical/microk8s/issues/68#issuecomment-404923563
-          sudo ufw allow in on cni0
-          sudo ufw allow out on cni0
-          sudo ufw default allow routed
-          sudo iptables -P FORWARD ACCEPT
-          sudo iptables -t nat -A POSTROUTING -s 10.85.0.0/16 -o eth0 -j MASQUERADE
-
-          sudo kubeadm reset -f --cri-socket=unix:///var/run/crio/crio.sock
-
-          # https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm
-          sudo kubeadm init --config=ci/cached-builds/kubeadm.yaml
-
-          mkdir -p $HOME/.kube
-          sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
-          sudo chown $(id -u):$(id -g) $HOME/.kube/config
-
-      - name: Show kubelet debug data (on failure)
-        if: ${{ failure() && steps.have-tests.outputs.tests == 'true' && steps.install-crio.outcome == 'success' }}
-        run: |
-          set -Eeuxo pipefail
-
-          # [kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
-          sudo cat /var/lib/kubelet/kubeadm-flags.env || true
-          # [kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
-          sudo cat /var/lib/kubelet/config.yaml || true
-
-          sudo systemctl cat kubelet.service || true
-
-          sudo cat /etc/systemd/system/kubelet.service.d/10-kubeadm.conf || true
-
-          sudo systemctl status kubelet || true
-          sudo journalctl -xeu kubelet
-
-          # Here is one example how you may list all running Kubernetes containers by using crictl:
-          sudo crictl --runtime-endpoint unix:///var/run/crio/crio.sock ps -a | grep kube | grep -v pause
-          # Once you have found the failing container, you can inspect its logs with:
-          # crictl --runtime-endpoint unix:///var/run/crio/crio.sock logs CONTAINERID
-
-      - name: Show nodes status and wait for readiness
-        if: ${{ steps.have-tests.outputs.tests == 'true' }}
-        run: |
-          kubectl describe nodes
-          kubectl wait --for=condition=Ready nodes --all --timeout=100s || (kubectl describe nodes && false)
-
-      - name: Wait for pods to be running
-        if: ${{ steps.have-tests.outputs.tests == 'true' }}
-        run: |
-          set -Eeuxo pipefail
-          kubectl wait deployments --all --all-namespaces --for=condition=Available --timeout=100s
-          kubectl wait pods --all --all-namespaces --for=condition=Ready --timeout=100s
-
-      - name: "Install local-path provisioner"
-        if: ${{ steps.have-tests.outputs.tests == 'true' }}
-        run: |
-          set -Eeuxo pipefail
-          kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.31/deploy/local-path-storage.yaml
-          kubectl wait deployments --all --namespace=local-path-storage --for=condition=Available --timeout=100s
-          # https://kubernetes.io/docs/tasks/administer-cluster/change-default-storage-class/
-          kubectl get storageclass
-          kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
+        uses: ./.github/actions/provision-k8s
 
       - name: "Run image tests"
         # skip on s390x because we are unable to install requirements-elyra.txt that's installed by runtime image tests
```
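One mechanical consequence of the extraction, visible in both new files: `run:` steps in a composite action must declare an explicit `shell:`, whereas workflow steps fall back to a default shell, which is why every extracted step now carries `shell: bash`. A minimal sketch of the rule (step name and command are illustrative):

```yaml
runs:
  using: "composite"
  steps:
    - name: Composite steps need an explicit shell
      shell: bash  # required here; omitting it fails action validation
      run: echo "hello from a composite action"
```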
