Skip to content

How to join an NVIDIA GPU node in KubeEdge #454

@tangming1996

Description

@tangming1996

How to join an NVIDIA GPU node in KubeEdge

1. Install the GPU Driver First
  • Install the corresponding driver according to the actual GPU model.
2. Install Docker or Containerd
3. Install Nvidia-Container-Toolkit
# If the server can access the external network, it can be installed directly in the following way.
https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
# If the server cannot access the external network, you can first download the offline installation package of nvidia-container-toolkit on GitHub. The website is as follows:
https://github.com/NVIDIA/nvidia-container-toolkit/releases
# Then extract the archive and enter the package directory. Use `sudo apt install ./*` to install all packages, as follows (only tested on Ubuntu):
root@edgenode:~/release-v1.16.0-rc.1-experimental/packages/ubuntu18.04/amd64# pwd
/root/release-v1.16.0-rc.1-experimental/packages/ubuntu18.04/amd64
root@edgenode:~/release-v1.16.0-rc.1-experimental/packages/ubuntu18.04/amd64# ls
libnvidia-container1_1.16.0~rc.1-1_amd64.deb      libnvidia-container-tools_1.16.0~rc.1-1_amd64.deb      nvidia-container-toolkit-operator-extensions_1.16.0~rc.1-1_amd64.deb
libnvidia-container1-dbg_1.16.0~rc.1-1_amd64.deb  nvidia-container-toolkit_1.16.0~rc.1-1_amd64.deb
libnvidia-container-dev_1.16.0~rc.1-1_amd64.deb   nvidia-container-toolkit-base_1.16.0~rc.1-1_amd64.deb
root@edgenode:~/release-v1.16.0-rc.1-experimental/packages/ubuntu18.04/amd64# sudo apt install ./*
4. Configure Docker or Containerd to use nvidia-runtime
# After the installation of nvidia-container-toolkit is completed, nvidia-ctk can be used to configure nvidia-runtime.
# docker
sudo nvidia-ctk runtime configure --runtime=docker --set-as-default
# containerd
sudo nvidia-ctk runtime configure --runtime=containerd --set-as-default
5. Restart Docker or Containerd.
# docker:
systemctl daemon-reload && systemctl restart docker
# Check whether the runtime is modified successfully.
root@nano-desktop:~# docker info |grep Runtime
 Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux nvidia runc
 Default Runtime: nvidia

# containerd:
systemctl daemon-reload && systemctl restart containerd
root@edgenode:~# cat /etc/containerd/config.toml |grep nvidia
      default_runtime_name = "nvidia"
        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            BinaryName = "/usr/bin/nvidia-container-runtime"
6. Join the KubeEdge Node
# Adjust the parameters in the following join command according to your actual environment
# docker:
keadm join --cgroupdriver=systemd \
	--cloudcore-ipport=10.31.226.13:30000 \
	--hub-protocol=websocket \
	--certport=30002 \
	--edgenode-name=nvidia-edge-node \
	--token=7aeef1a1b0020b608e112c4ac0084727d222811d18c4afbe4dc2446d82030ec5.eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3Mjk3MzYxMDF9.plL_BQtn9NxW5L2MyBe57ZKwSq6_0KopNSARdTD8IaY \
	--image-repository=docker.m.daocloud.io/kubeedge \
	--kubeedge-version=v1.17.0 \
	--remote-runtime-endpoint=unix:///var/run/cri-dockerd.sock

# containerd:
keadm join --cgroupdriver=cgroupfs \
	--cloudcore-ipport=10.31.226.13:30000 \
	--hub-protocol=websocket \
	--certport=30002 \
	--edgenode-name=nano-1iamih8np \
	--token=7aeef1a1b0020b608e112c4ac0084727d222811d18c4afbe4dc2446d82030ec5.eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3Mjk3MzYxMDF9.plL_BQtn9NxW5L2MyBe57ZKwSq6_0KopNSARdTD8IaY \
	--image-repository=docker.m.daocloud.io/kubeedge \
	--kubeedge-version=v1.17.0 \
	--remote-runtime-endpoint=unix:///run/containerd/containerd.sock
	
[root@master-01 ~]# kubectl get  node
NAME                  STATUS   ROLES                  AGE    VERSION
master-01             Ready    control-plane,master   333d   v1.23.0
nano-1iamih8np        Ready    agent,edge             2d6h   v1.28.6-kubeedge-v1.17.0
nvidia-edge-node      Ready    agent,edge             25m    v1.28.6-kubeedge-v1.17.0
7. Deploy the daemonset (k8s-device-plugin)
# Refer to the YAML. The key is to configure toleration so that pods can be scheduled to edge nodes.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      containers:
      - env:
        - name: FAIL_ON_INIT_ERROR
          value: "false"
        image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3
        imagePullPolicy: IfNotPresent
        name: nvidia-device-plugin-ctr
        resources: {}
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /var/lib/kubelet/device-plugins
          name: device-plugin
      dnsPolicy: ClusterFirst
      priorityClassName: system-node-critical
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 30
      tolerations:
      - effect: NoSchedule
        key: nvidia.com/gpu
        operator: Exists
      - effect: NoSchedule
        key: node-role.kubernetes.io/edge
        operator: Exists
      - effect: NoSchedule
        key: k8s.io/nano
        operator: Exists
      volumes:
      - hostPath:
          path: /var/lib/kubelet/device-plugins
          type: ""
        name: device-plugin

# After deployment, check whether it is successfully deployed on the edge node
[root@master-01 ~]# kubectl get daemonsets.apps -n kube-system|grep nvidia
NAME                             DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR            AGE
nvidia-device-plugin-daemonset   2         2         2       2            2           <none>                   292d
[root@master-01 ~]# kubectl get po -n kube-system -owide|grep nvidia
nvidia-device-plugin-daemonset-d5nbc   1/1     Running   0                22m    10.88.0.4      nvidia-edge-node      <none>           <none>
nvidia-device-plugin-daemonset-qbwdd   1/1     Running   0                2d6h   10.88.0.2      nano-1iamih8np        <none>           <none>
8. Verify whether the GPU information is reported successfully
# If the [nvidia.com/gpu] key appears under the Capacity and Allocatable fields, the device-plugin has been deployed successfully and the node's GPU information has been reported.
[root@master-01 nvidia-test]# kubectl describe node nvidia-edge-node
Name:               nvidia-edge-node
Roles:              agent,edge
Labels:             beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/os=linux
                    kubernetes.io/arch=amd64
                    kubernetes.io/hostname=nvidia-edge-node
                    kubernetes.io/os=linux
                    node-role.kubernetes.io/agent=
                    node-role.kubernetes.io/edge=
                    node.kpanda.io/gpu-vendor=Nvidia-GPU
Annotations:        node.alpha.kubernetes.io/ttl: 0
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Wed, 23 Oct 2024 16:48:00 +0800
Taints:             node-role.kubernetes.io/edge:NoSchedule
Unschedulable:      false
Lease:
  HolderIdentity:  nvidia-edge-node
  AcquireTime:     <unset>
  RenewTime:       Wed, 23 Oct 2024 16:50:24 +0800
Conditions:
  Type             Status  LastHeartbeatTime                 LastTransitionTime                Reason                       Message
  ----             ------  -----------------                 ------------------                ------                       -------
  MemoryPressure   False   Wed, 23 Oct 2024 16:48:21 +0800   Wed, 23 Oct 2024 16:48:00 +0800   KubeletHasSufficientMemory   kubelet has sufficient memory available
  DiskPressure     False   Wed, 23 Oct 2024 16:48:21 +0800   Wed, 23 Oct 2024 16:48:00 +0800   KubeletHasNoDiskPressure     kubelet has no disk pressure
  PIDPressure      False   Wed, 23 Oct 2024 16:48:21 +0800   Wed, 23 Oct 2024 16:48:00 +0800   KubeletHasSufficientPID      kubelet has sufficient PID available
  Ready            True    Wed, 23 Oct 2024 16:48:21 +0800   Wed, 23 Oct 2024 16:48:00 +0800   EdgeReady                    edge is posting ready status. AppArmor enabled
Addresses:
  InternalIP:  10.64.24.29
  Hostname:    nvidia-edge-node
Capacity:
  cpu:                12
  ephemeral-storage:  143075484Ki
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             40917620Ki
  nvidia.com/gpu:     1
  pods:               110
Allocatable:
  cpu:                12
  ephemeral-storage:  131858365837
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             40815220Ki
  nvidia.com/gpu:     1
  pods:               110
System Info:
  Machine ID:                 97b1ae8e10df4fcda613e21fa937a458
  System UUID:                cb30b8a3-e57c-aab9-3f9e-50ebf6580bd0
  Boot ID:                    dc38fe36-e342-43fc-afc4-dfc24512c23a
  Kernel Version:             6.8.0-45-generic
  OS Image:                   Ubuntu 24.04 LTS
  Operating System:           linux
  Architecture:               amd64
  Container Runtime Version:  docker://24.0.7
  Kubelet Version:            v1.28.6-kubeedge-v1.17.0
  Kube-Proxy Version:         v0.0.0-master+$Format:%H$
PodCIDR:                      10.244.15.0/24
PodCIDRs:                     10.244.15.0/24
Non-terminated Pods:          (4 in total)
  Namespace                   Name                                    CPU Requests  CPU Limits  Memory Requests  Memory Limits  Age
  ---------                   ----                                    ------------  ----------  ---------------  -------------  ---
  kube-system                 nvidia-device-plugin-daemonset-wqzrd    0 (0%)        0 (0%)      0 (0%)           0 (0%)         2m25s
  kubeedge                    edge-eclipse-mosquitto-cnfwb            100m (0%)     200m (1%)   64Mi (0%)        128Mi (0%)     2m25s
  kubeedge                    edgemesh-agent-4wbs6                    500m (4%)     1 (8%)      128Mi (0%)       256Mi (0%)     2m25s
  sedna                       lc-j6rvh                                100m (0%)     200m (1%)   32Mi (0%)        128Mi (0%)     2m25s
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource           Requests    Limits
  --------           --------    ------
  cpu                700m (5%)   1400m (11%)
  memory             224Mi (0%)  512Mi (1%)
  ephemeral-storage  0 (0%)      0 (0%)
  hugepages-1Gi      0 (0%)      0 (0%)
  hugepages-2Mi      0 (0%)      0 (0%)
  nvidia.com/gpu     0           0
Events:              <none>
9. Deploy an application for testing
# Test template
kind: Deployment
apiVersion: apps/v1
metadata:
  name: test-gpu
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: test-gpu
  template:
    metadata:
      labels:
        app: test-gpu
    spec:
      containers:
        - name: container-1
          image: pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
          command:
            - tail
            - '-f'
            - /dev/null
          resources:
            limits:
              nvidia.com/gpu: '1'
            requests:
              nvidia.com/gpu: '1'
          imagePullPolicy: IfNotPresent
      nodeName: nvidia-edge-node
      schedulerName: default-scheduler
      tolerations:
        - key: node-role.kubernetes.io/edge
          operator: Exists
          effect: NoSchedule
10. After the application is successfully deployed, you can verify whether the GPU is available on the corresponding node
# Enter the application container on the node and call torch.cuda.is_available(); a return value of True indicates that the GPU driver is working correctly.
# docker
root@nano-desktop:~# docker ps
CONTAINER ID   IMAGE                       COMMAND                  CREATED          STATUS          PORTS     NAMES
e7e3804626a5   853b58c1dce6                "tail -f /dev/null"      53 seconds ago   Up 45 seconds             k8s_container-1_test-gpu-arm64-nano-7f8fd7f79f-hzvp5_default_64fb7a90-b0e6-4b46-a34f-8a06b24b9169_0
root@nano-desktop:~# docker exec -it e7e3804626a5 /bin/bash
root@test-gpu-arm64-nano-7f8fd7f79f-hzvp5:/# python3
Python 3.8.10 (default, Nov 14 2022, 12:59:47)
[GCC 9.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.cuda.is_available()
True

# containerd
root@edgenode:~# crictl ps
CONTAINER           IMAGE               CREATED             STATE               NAME                       ATTEMPT             POD ID              POD
de1f1e60abc0a       0dd75116a8ce8       2 minutes ago       Running             container-1                0                   6beffb412af3f       test-gpu-6bfbdc9449-jfbrl
root@edgenode:~# crictl exec -it de1f1e60abc0a /bin/bash
root@test-gpu-6bfbdc9449-jfbrl:/workspace# python3
Python 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.cuda.is_available()
True

Metadata

Metadata

Assignees

No one assigned

    Labels

    kind/questionIndicates an issue that is a support question.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions