-
Notifications
You must be signed in to change notification settings - Fork 169
Open
Labels
kind/question — Indicates an issue that is a support question.
Description
How to join a nvidia GPU node in kubeedge
1. Install the GPU Driver First
- Install the corresponding driver according to the actual GPU model.
2. Install Docker or Containerd
- Note: If Docker is used, KubeEdge no longer directly supports it after version 1.12, and [cri-dockerd](https://github.com/Mirantis/cri-dockerd/releases) needs to be additionally installed for support.
3. Install Nvidia-Container-Toolkit
# If the server can access the external network, it can be installed directly in the following way.
https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
# If the server cannot access the external network, you can first download the offline installation package of nvidia-container-toolkit on GitHub. The website is as follows:
https://github.com/NVIDIA/nvidia-container-toolkit/releases
# Then unzip and enter the file directory. Use `sudo apt install ./*` to install all packages as follows (only tested on Ubuntu).
root@edgenode:~/release-v1.16.0-rc.1-experimental/packages/ubuntu18.04/amd64# pwd
/root/release-v1.16.0-rc.1-experimental/packages/ubuntu18.04/amd64
root@edgenode:~/release-v1.16.0-rc.1-experimental/packages/ubuntu18.04/amd64# ls
libnvidia-container1_1.16.0~rc.1-1_amd64.deb libnvidia-container-tools_1.16.0~rc.1-1_amd64.deb nvidia-container-toolkit-operator-extensions_1.16.0~rc.1-1_amd64.deb
libnvidia-container1-dbg_1.16.0~rc.1-1_amd64.deb nvidia-container-toolkit_1.16.0~rc.1-1_amd64.deb
libnvidia-container-dev_1.16.0~rc.1-1_amd64.deb nvidia-container-toolkit-base_1.16.0~rc.1-1_amd64.deb
root@edgenode:~/release-v1.16.0-rc.1-experimental/packages/ubuntu18.04/amd64# sudo apt install ./*
4. Configure Docker or Containerd to use nvidia-runtime
# After the installation of nvidia-container-toolkit is completed, nvidia-ctk can be used to configure nvidia-runtime.
# docker
sudo nvidia-ctk runtime configure --runtime=docker --set-as-default
# containerd
sudo nvidia-ctk runtime configure --runtime=containerd --set-as-default
5. Restart Docker or Containerd
# docker:
systemctl daemon-reload && systemctl restart docker
# Check whether the runtime is modified successfully.
root@nano-desktop:~# docker info |grep Runtime
Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux nvidia runc
Default Runtime: nvidia
# containerd:
systemctl daemon-reload && systemctl restart containerd
root@edgenode:~# cat /etc/containerd/config.toml |grep nvidia
default_runtime_name = "nvidia"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
6. Join the KubeEdge Node
# Configure various parameters in the following connection command according to the actual environment
# docker:
keadm join --cgroupdriver=systemd \
--cloudcore-ipport=10.31.226.13:30000 \
--hub-protocol=websocket \
--certport=30002 \
--edgenode-name=nvidia-edge-node \
--token=7aeef1a1b0020b608e112c4ac0084727d222811d18c4afbe4dc2446d82030ec5.eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3Mjk3MzYxMDF9.plL_BQtn9NxW5L2MyBe57ZKwSq6_0KopNSARdTD8IaY \
--image-repository=docker.m.daocloud.io/kubeedge \
--kubeedge-version=v1.17.0 \
--remote-runtime-endpoint=unix:///var/run/cri-dockerd.sock
# containerd:
keadm join --cgroupdriver=cgroupfs \
--cloudcore-ipport=10.31.226.13:30000 \
--hub-protocol=websocket \
--certport=30002 \
--edgenode-name=nano-1iamih8np \
--token=7aeef1a1b0020b608e112c4ac0084727d222811d18c4afbe4dc2446d82030ec5.eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3Mjk3MzYxMDF9.plL_BQtn9NxW5L2MyBe57ZKwSq6_0KopNSARdTD8IaY \
--image-repository=docker.m.daocloud.io/kubeedge \
--kubeedge-version=v1.17.0 \
--remote-runtime-endpoint=unix:///run/containerd/containerd.sock
[root@master-01 ~]# kubectl get node
NAME STATUS ROLES AGE VERSION
master-01 Ready control-plane,master 333d v1.23.0
nano-1iamih8np Ready agent,edge 2d6h v1.28.6-kubeedge-v1.17.0
nvidia-edge-node Ready agent,edge 25m v1.28.6-kubeedge-v1.17.0
7. Deploy the daemonset (k8s-device-plugin)
# Refer to the YAML. The key is to configure toleration so that pods can be scheduled to edge nodes.
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
spec:
revisionHistoryLimit: 10
selector:
matchLabels:
name: nvidia-device-plugin-ds
template:
metadata:
labels:
name: nvidia-device-plugin-ds
spec:
containers:
- env:
- name: FAIL_ON_INIT_ERROR
value: "false"
image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3
imagePullPolicy: IfNotPresent
name: nvidia-device-plugin-ctr
resources: {}
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /var/lib/kubelet/device-plugins
name: device-plugin
dnsPolicy: ClusterFirst
priorityClassName: system-node-critical
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
- effect: NoSchedule
key: node-role.kubernetes.io/edge
operator: Exists
- effect: NoSchedule
key: k8s.io/nano
operator: Exists
volumes:
- hostPath:
path: /var/lib/kubelet/device-plugins
type: ""
name: device-plugin
# After deployment, check whether it is successfully deployed on the edge node
[root@master-01 ~]# kubectl get daemonsets.apps -n kube-system|grep nvidia
NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
nvidia-device-plugin-daemonset 2 2 2 2 2 <none> 292d
[root@master-01 ~]# kubectl get po -n kube-system -owide|grep nvidia
nvidia-device-plugin-daemonset-d5nbc 1/1 Running 0 22m 10.88.0.4 nvidia-edge-node <none> <none>
nvidia-device-plugin-daemonset-qbwdd 1/1 Running 0 2d6h 10.88.0.2 nano-1iamih8np <none> <none>
8. Verify whether the GPU information is reported successfully
# Seeing the key of [nvidia.com/gpu] under the Capacity and Allocatable fields indicates that the device-plugin is deployed successfully and the GPU information of the node has been successfully reported.
[root@master-01 nvidia-test]# kubectl describe node nvidia-edge-node
Name: nvidia-edge-node
Roles: agent,edge
Labels: beta.kubernetes.io/arch=amd64
beta.kubernetes.io/os=linux
kubernetes.io/arch=amd64
kubernetes.io/hostname=nvidia-edge-node
kubernetes.io/os=linux
node-role.kubernetes.io/agent=
node-role.kubernetes.io/edge=
node.kpanda.io/gpu-vendor=Nvidia-GPU
Annotations: node.alpha.kubernetes.io/ttl: 0
volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp: Wed, 23 Oct 2024 16:48:00 +0800
Taints: node-role.kubernetes.io/edge:NoSchedule
Unschedulable: false
Lease:
HolderIdentity: nvidia-edge-node
AcquireTime: <unset>
RenewTime: Wed, 23 Oct 2024 16:50:24 +0800
Conditions:
Type Status LastHeartbeatTime LastTransitionTime Reason Message
---- ------ ----------------- ------------------ ------ -------
MemoryPressure False Wed, 23 Oct 2024 16:48:21 +0800 Wed, 23 Oct 2024 16:48:00 +0800 KubeletHasSufficientMemory kubelet has sufficient memory available
DiskPressure False Wed, 23 Oct 2024 16:48:21 +0800 Wed, 23 Oct 2024 16:48:00 +0800 KubeletHasNoDiskPressure kubelet has no disk pressure
PIDPressure False Wed, 23 Oct 2024 16:48:21 +0800 Wed, 23 Oct 2024 16:48:00 +0800 KubeletHasSufficientPID kubelet has sufficient PID available
Ready True Wed, 23 Oct 2024 16:48:21 +0800 Wed, 23 Oct 2024 16:48:00 +0800 EdgeReady edge is posting ready status. AppArmor enabled
Addresses:
InternalIP: 10.64.24.29
Hostname: nvidia-edge-node
Capacity:
cpu: 12
ephemeral-storage: 143075484Ki
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 40917620Ki
nvidia.com/gpu: 1
pods: 110
Allocatable:
cpu: 12
ephemeral-storage: 131858365837
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 40815220Ki
nvidia.com/gpu: 1
pods: 110
System Info:
Machine ID: 97b1ae8e10df4fcda613e21fa937a458
System UUID: cb30b8a3-e57c-aab9-3f9e-50ebf6580bd0
Boot ID: dc38fe36-e342-43fc-afc4-dfc24512c23a
Kernel Version: 6.8.0-45-generic
OS Image: Ubuntu 24.04 LTS
Operating System: linux
Architecture: amd64
Container Runtime Version: docker://24.0.7
Kubelet Version: v1.28.6-kubeedge-v1.17.0
Kube-Proxy Version: v0.0.0-master+$Format:%H$
PodCIDR: 10.244.15.0/24
PodCIDRs: 10.244.15.0/24
Non-terminated Pods: (4 in total)
Namespace Name CPU Requests CPU Limits Memory Requests Memory Limits Age
--------- ---- ------------ ---------- --------------- ------------- ---
kube-system nvidia-device-plugin-daemonset-wqzrd 0 (0%) 0 (0%) 0 (0%) 0 (0%) 2m25s
kubeedge edge-eclipse-mosquitto-cnfwb 100m (0%) 200m (1%) 64Mi (0%) 128Mi (0%) 2m25s
kubeedge edgemesh-agent-4wbs6 500m (4%) 1 (8%) 128Mi (0%) 256Mi (0%) 2m25s
sedna lc-j6rvh 100m (0%) 200m (1%) 32Mi (0%) 128Mi (0%) 2m25s
Allocated resources:
(Total limits may be over 100 percent, i.e., overcommitted.)
Resource Requests Limits
-------- -------- ------
cpu 700m (5%) 1400m (11%)
memory 224Mi (0%) 512Mi (1%)
ephemeral-storage 0 (0%) 0 (0%)
hugepages-1Gi 0 (0%) 0 (0%)
hugepages-2Mi 0 (0%) 0 (0%)
nvidia.com/gpu 0 0
Events: <none>
9. Deploy an application for testing
# Test template
kind: Deployment
apiVersion: apps/v1
metadata:
name: test-gpu
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: test-gpu
template:
metadata:
labels:
app: test-gpu
spec:
containers:
- name: container-1
image: pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
command:
- tail
- '-f'
- /dev/null
resources:
limits:
nvidia.com/gpu: '1'
requests:
nvidia.com/gpu: '1'
imagePullPolicy: IfNotPresent
nodeName: nvidia-edge-node
schedulerName: default-scheduler
tolerations:
- key: node-role.kubernetes.io/edge
operator: Exists
- effect: NoSchedule
10. After the application is successfully deployed, you can verify whether the GPU is available on the corresponding node
# Enter the application container on the node and use the torch.cuda.is_available() method to judge. Returning True indicates that the GPU is successfully driven.
# docker
root@nano-desktop:~# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
e7e3804626a5 853b58c1dce6 "tail -f /dev/null" 53 seconds ago Up 45 seconds k8s_container-1_test-gpu-arm64-nano-7f8fd7f79f-hzvp5_default_64fb7a90-b0e6-4b46-a34f-8a06b24b9169_0
root@nano-desktop:~# docker exec -it e7e3804626a5 /bin/bash
root@test-gpu-arm64-nano-7f8fd7f79f-hzvp5:/# python3
Python 3.8.10 (default, Nov 14 2022, 12:59:47)
[GCC 9.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.cuda.is_available()
True
# containerd
root@edgenode:~# crictl ps
CONTAINER IMAGE CREATED STATE NAME ATTEMPT POD ID POD
de1f1e60abc0a 0dd75116a8ce8 2 minutes ago Running container-1 0 6beffb412af3f test-gpu-6bfbdc9449-jfbrl
root@edgenode:~# crictl exec -it de1f1e60abc0a /bin/bash
root@test-gpu-6bfbdc9449-jfbrl:/workspace# python3
Python 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.cuda.is_available()
True
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
kind/question — Indicates an issue that is a support question.