@@ -133,33 +133,12 @@ jobs:
133
133
# region Free up disk space
134
134
135
135
- name : Free up additional disk space
136
+ uses : ./.github/actions/free-up-disk-space
136
137
# https://docs.github.com/en/actions/learn-github-actions/expressions
137
138
# NOTE: the arm64 GitHub hosted runner does not have the /mnt-mounted scratch disk
138
139
if : " ${{ contains(inputs.target, 'rocm') || contains(inputs.target, 'cuda') ||
139
140
contains(inputs.target, 'pytorch') || contains(inputs.target, 'tensorflow') ||
140
141
inputs.platform == 'linux/arm64' }}"
141
- run : |
142
- set -x
143
-
144
- df -h
145
-
146
- sudo apt-get update
147
- sudo apt-get purge -y '^dotnet-.*' '^llvm-.*' 'php.*' '^mongodb-.*'
148
- sudo apt-get autoremove -y --purge
149
- sudo apt-get clean
150
- sudo rm -rf /usr/local/.ghcup &
151
- sudo rm -rf /usr/local/lib/android &
152
- sudo rm -rf /usr/local/share/boost &
153
- sudo rm -rf /usr/local/lib/node_modules &
154
- sudo rm -rf /usr/share/dotnet &
155
- sudo rm -rf /opt/ghc &
156
- sudo rm -rf /opt/hostedtoolcache/CodeQL &
157
-
158
- sudo docker image prune --all --force &
159
-
160
- wait
161
-
162
- df -h
163
142
164
143
- id : install-compsize
165
144
run : sudo apt-get install -y btrfs-compsize
@@ -398,164 +377,9 @@ jobs:
398
377
ln -s ../rocm-tensorflow runtimes/rocm/tensorflow
399
378
ln -s ../rocm-pytorch runtimes/rocm/pytorch
400
379
401
- # https://cri-o.io/
402
- - name : Install cri-o
403
- id : install-crio
404
- if : ${{ steps.have-tests.outputs.tests == 'true' }}
405
- run : |
406
- set -Eeuxo pipefail
407
-
408
- # the Microsoft repo's kubelet does not provide /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
409
- # [Service]
410
- # EnvironmentFile=-/var/lib/kubelet/kubeadm-flags.env
411
- # ExecStart=/usr/bin/kubelet $KUBELET_KUBEADM_ARGS
412
- sudo ls /etc/apt/sources.list.d/
413
- sudo rm /etc/apt/sources.list.d/microsoft-prod.list
414
-
415
- sudo apt-get update
416
- sudo apt-get install -y software-properties-common curl
417
-
418
- # https://github.com/cri-o/packaging?tab=readme-ov-file#distributions-using-deb-packages
419
-
420
- curl -fsSL https://pkgs.k8s.io/core:/stable:/v${KUBERNETES_VERSION}/deb/Release.key | \
421
- sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
422
-
423
- echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v${KUBERNETES_VERSION}/deb/ /" | \
424
- sudo tee /etc/apt/sources.list.d/kubernetes.list
425
-
426
- curl -fsSL https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v${CRIO_VERSION}/deb/Release.key | \
427
- sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/cri-o-apt-keyring.gpg
428
-
429
- echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v${CRIO_VERSION}/deb/ /" | \
430
- sudo tee /etc/apt/sources.list.d/cri-o.list
431
-
432
- sudo apt-get update
433
-
434
- # [ERROR FileExisting-conntrack]: conntrack not found in system path
435
- # see man apt-patterns for the ~name=version* syntax
436
-
437
- # The following packages will be DOWNGRADED:
438
- # kubectl
439
- # E: Packages were downgraded and -y was used without --allow-downgrades.
440
-
441
- sudo apt-get install -y --allow-downgrades \
442
- "cri-o=${CRIO_VERSION}.*" \
443
- "kubelet=${KUBERNETES_VERSION}.*" "kubeadm=${KUBERNETES_VERSION}.*" "kubectl=${KUBERNETES_VERSION}.*" \
444
- conntrack
445
-
446
- # make use of /etc/cni/net.d/11-crio-ipv4-bridge.conflist so we don't
447
- # need a pod network and just use the default bridge
448
- sudo rm -rf /etc/cni/net.d/*
449
- # cat /etc/cni/net.d/11-crio-ipv4-bridge.conflist
450
- # https://github.com/containerd/containerd/blob/main/script%2Fsetup%2Finstall-cni
451
- # https://www.cni.dev/plugins/current/main/bridge/
452
- sudo cp ci/cached-builds/11-crio-ipv4-bridge.conflist /etc/cni/net.d/11-crio-ipv4-bridge.conflist
453
-
454
- sudo cp ci/cached-builds/crio.conf /etc/crio/crio.conf.d/
455
-
456
- sudo systemctl daemon-reload
457
- sudo systemctl start crio.service
458
- env :
459
- # TODO(jdanek): install also "cri-tools=${CRIO_VERSION}.*" when updating to 1.33
460
- CRIO_VERSION : 1.32
461
- # This has to be kept in sync with the packages above, otherwise
462
- # [ERROR KubeletVersion]: the kubelet version is higher than the control plane version.
463
- # This is not a supported version skew and may lead to a malfunctional cluster.
464
- # Kubelet version: "1.33.0" Control plane version: "1.30.12"
465
- KUBERNETES_VERSION : 1.33
466
- # Also update version in kubeadm.yaml
467
-
468
- - run : sudo crictl info
380
+ - name : Provision K8s cluster
469
381
if : ${{ steps.have-tests.outputs.tests == 'true' }}
470
-
471
- - name : Show crio debug data (on failure)
472
- if : ${{ failure() && steps.have-tests.outputs.tests == 'true' }}
473
- run : |
474
- set -Eeuxo pipefail
475
-
476
- sudo systemctl status crio.service || true
477
- sudo journalctl -xeu crio.service
478
-
479
- # do this early, it's a good check that cri-o is not completely broken
480
- - name : " Show crio images information"
481
- if : ${{ steps.have-tests.outputs.tests == 'true' }}
482
- run : sudo crictl images
483
-
484
- - name : Install Kubernetes cluster
485
- if : ${{ steps.have-tests.outputs.tests == 'true' }}
486
- run : |
487
- set -Eeuxo pipefail
488
-
489
- sudo swapoff -a
490
- sudo modprobe br_netfilter
491
- sudo sysctl -w net.ipv4.ip_forward=1
492
-
493
- # Was getting strange DNS resolution errors from pods that don't seem to want to go away sometimes:
494
- # Resolving raw.githubusercontent.com (raw.githubusercontent.com)... failed: Name or service not known.
495
- # wget: unable to resolve host address ‘raw.githubusercontent.com’
496
- # Here's what helped:
497
- # https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/#known-issues
498
- # https://github.com/kubernetes/kubernetes/blob/e4c1f980b76fecece30c2f77885a7117192170a6/CHANGELOG/CHANGELOG-1.30.md?plain=1#L1454
499
- # https://github.com/canonical/microk8s/issues/68#issuecomment-404923563
500
- sudo ufw allow in on cni0
501
- sudo ufw allow out on cni0
502
- sudo ufw default allow routed
503
- sudo iptables -P FORWARD ACCEPT
504
- sudo iptables -t nat -A POSTROUTING -s 10.85.0.0/16 -o eth0 -j MASQUERADE
505
-
506
- sudo kubeadm reset -f --cri-socket=unix:///var/run/crio/crio.sock
507
-
508
- # https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm
509
- sudo kubeadm init --config=ci/cached-builds/kubeadm.yaml
510
-
511
- mkdir -p $HOME/.kube
512
- sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
513
- sudo chown $(id -u):$(id -g) $HOME/.kube/config
514
-
515
- - name : Show kubelet debug data (on failure)
516
- if : ${{ failure() && steps.have-tests.outputs.tests == 'true' && steps.install-crio.outcome == 'success' }}
517
- run : |
518
- set -Eeuxo pipefail
519
-
520
- # [kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
521
- sudo cat /var/lib/kubelet/kubeadm-flags.env || true
522
- # [kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
523
- sudo cat /var/lib/kubelet/config.yaml || true
524
-
525
- sudo systemctl cat kubelet.service || true
526
-
527
- sudo cat /etc/systemd/system/kubelet.service.d/10-kubeadm.conf || true
528
-
529
- sudo systemctl status kubelet || true
530
- sudo journalctl -xeu kubelet
531
-
532
- # Here is one example how you may list all running Kubernetes containers by using crictl:
533
- sudo crictl --runtime-endpoint unix:///var/run/crio/crio.sock ps -a | grep kube | grep -v pause
534
- # Once you have found the failing container, you can inspect its logs with:
535
- # crictl --runtime-endpoint unix:///var/run/crio/crio.sock logs CONTAINERID
536
-
537
- - name : Show nodes status and wait for readiness
538
- if : ${{ steps.have-tests.outputs.tests == 'true' }}
539
- run : |
540
- kubectl describe nodes
541
- kubectl wait --for=condition=Ready nodes --all --timeout=100s || (kubectl describe nodes && false)
542
-
543
- - name : Wait for pods to be running
544
- if : ${{ steps.have-tests.outputs.tests == 'true' }}
545
- run : |
546
- set -Eeuxo pipefail
547
- kubectl wait deployments --all --all-namespaces --for=condition=Available --timeout=100s
548
- kubectl wait pods --all --all-namespaces --for=condition=Ready --timeout=100s
549
-
550
- - name : " Install local-path provisioner"
551
- if : ${{ steps.have-tests.outputs.tests == 'true' }}
552
- run : |
553
- set -Eeuxo pipefail
554
- kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.31/deploy/local-path-storage.yaml
555
- kubectl wait deployments --all --namespace=local-path-storage --for=condition=Available --timeout=100s
556
- # https://kubernetes.io/docs/tasks/administer-cluster/change-default-storage-class/
557
- kubectl get storageclass
558
- kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
382
+ uses : ./.github/actions/provision-k8s
559
383
560
384
- name : " Run image tests"
561
385
# skip on s390x because we are unable to install requirements-elyra.txt that's installed by runtime image tests
0 commit comments