1 | 1 | package nvidia |
2 | 2 |
3 | 3 | import ( |
| 4 | + "encoding/json" |
4 | 5 | "fmt" |
5 | | - "os" |
6 | | - "sort" |
7 | | - "time" |
8 | | - |
| 6 | + "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client" |
9 | 7 | log "github.com/golang/glog" |
10 | | - "k8s.io/apimachinery/pkg/labels" |
11 | | - |
12 | 8 | "k8s.io/api/core/v1" |
13 | 9 | "k8s.io/apimachinery/pkg/api/resource" |
14 | 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
15 | 11 | "k8s.io/apimachinery/pkg/fields" |
16 | | - |
| 12 | + "k8s.io/apimachinery/pkg/labels" |
17 | 13 | "k8s.io/apimachinery/pkg/types" |
18 | 14 | "k8s.io/client-go/kubernetes" |
19 | 15 | "k8s.io/client-go/rest" |
20 | 16 | "k8s.io/client-go/tools/clientcmd" |
21 | 17 | nodeutil "k8s.io/kubernetes/pkg/util/node" |
| 18 | + "os" |
| 19 | + "sort" |
| 20 | + "time" |
22 | 21 | ) |
23 | 22 |
24 | 23 | var ( |
25 | 24 | clientset *kubernetes.Clientset |
26 | 25 | nodeName string |
27 | | - retries = 5 |
| 26 | + retries = 8 |
28 | 27 | ) |
29 | 28 |
30 | 29 | func kubeInit() { |
@@ -58,18 +57,18 @@ func kubeInit() { |
58 | 57 | } |
59 | 58 |
60 | 59 | func disableCGPUIsolationOrNot() (bool, error) { |
61 | | - disable := false |
62 | | - node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{}) |
63 | | - if err != nil { |
64 | | - return disable, err |
65 | | - } |
66 | | - labels := node.ObjectMeta.Labels |
67 | | - value, ok := labels[EnvNodeLabelForDisableCGPU] |
68 | | - if ok && value == "true" { |
69 | | - log.Infof("enable gpusharing mode and disable cgpu mode") |
70 | | - disable = true |
71 | | - } |
72 | | - return disable, nil |
| 60 | + disable := false |
| 61 | + node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{}) |
| 62 | + if err != nil { |
| 63 | + return disable, err |
| 64 | + } |
| 65 | + labels := node.ObjectMeta.Labels |
| 66 | + value, ok := labels[EnvNodeLabelForDisableCGPU] |
| 67 | + if ok && value == "true" { |
| 68 | + log.Infof("enable gpusharing mode and disable cgpu mode") |
| 69 | + disable = true |
| 70 | + } |
| 71 | + return disable, nil |
73 | 72 | } |
74 | 73 |
75 | 74 | func patchGPUCount(gpuCount int) error { |
@@ -99,31 +98,90 @@ func patchGPUCount(gpuCount int) error { |
99 | 98 | return err |
100 | 99 | } |
101 | 100 |
102 | | -func getPendingPodsInNode() ([]v1.Pod, error) { |
103 | | - // pods, err := m.lister.List(labels.Everything()) |
104 | | - // if err != nil { |
105 | | - // return nil, err |
106 | | - // } |
107 | | - pods := []v1.Pod{} |
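| | +// getPodList queries the kubelet for the pods on this node and returns only those still in Pending phase; it reports an error when none are pending. |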
| 101 | +func getPodList(kubeletClient *client.KubeletClient) (*v1.PodList, error) { |
| 102 | + podList, err := kubeletClient.GetNodeRunningPods() |
| 103 | + if err != nil { |
| 104 | + return nil, err |
| 105 | + } |
108 | 106 |
109 | | - podIDMap := map[types.UID]bool{} |
| 107 | + list, _ := json.Marshal(podList) |
| 108 | + log.V(8).Infof("get pods list %v", string(list)) |
| 109 | + |
| 110 | + resultPodList := &v1.PodList{} |
| 111 | + for _, metaPod := range podList.Items { |
| 112 | + if metaPod.Status.Phase != v1.PodPending { |
| 113 | + continue |
| 114 | + } |
| 115 | + resultPodList.Items = append(resultPodList.Items, metaPod) |
| 116 | + } |
| 117 | + |
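| | + // treat an empty result as an error so the caller can retry or fall back to the API server |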
| 118 | + if len(resultPodList.Items) == 0 { |
| 119 | + return nil, fmt.Errorf("no pending pods found") |
| 120 | + } |
| 121 | + |
| 122 | + return resultPodList, nil |
| 123 | +} |
110 | 124 |
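| | +// getPodListsByQueryKubelet fetches pending pods from the kubelet /pods endpoint, retrying on failure, and falls back to listing from the API server when the kubelet keeps failing. |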
| 125 | +func getPodListsByQueryKubelet(kubeletClient *client.KubeletClient) (*v1.PodList, error) { |
| 126 | + podList, err := getPodList(kubeletClient) |
| 127 | + for i := 0; i < retries && err != nil; i++ { |
| 128 | + podList, err = getPodList(kubeletClient) |
| 129 | + log.Warningf("failed to get pending pod list from kubelet, retrying") |
| 130 | + time.Sleep(100 * time.Millisecond) |
| 131 | + } |
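| | + // every kubelet attempt failed; fall back to the API server |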
| 132 | + if err != nil { |
| 133 | + log.Warningf("failed to get pending pods from kubelet /pods api, falling back to listing from the apiserver") |
| 134 | + podList, err = getPodListsByListAPIServer() |
| 135 | + if err != nil { |
| 136 | + return nil, err |
| 137 | + } |
| 138 | + } |
| 139 | + return podList, nil |
| 140 | +} |
| 141 | + |
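| | +// getPodListsByListAPIServer lists the Pending pods bound to this node directly from the API server, retrying up to three times on failure. |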
| 142 | +func getPodListsByListAPIServer() (*v1.PodList, error) { |
111 | 143 | selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName, "status.phase": "Pending"}) |
112 | 144 | podList, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{ |
113 | 145 | FieldSelector: selector.String(), |
114 | 146 | LabelSelector: labels.Everything().String(), |
115 | 147 | }) |
116 | | - for i := 0; i < retries && err != nil; i++ { |
| 148 | + for i := 0; i < 3 && err != nil; i++ { |
117 | 149 | podList, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{ |
118 | 150 | FieldSelector: selector.String(), |
119 | 151 | LabelSelector: labels.Everything().String(), |
120 | 152 | }) |
121 | | - time.Sleep(100 * time.Millisecond) |
| 153 | + time.Sleep(1 * time.Second) |
122 | 154 | } |
123 | 155 | if err != nil { |
124 | 156 | return nil, fmt.Errorf("failed to get Pods assigned to node %v", nodeName) |
125 | 157 | } |
126 | 158 |
| 159 | + return podList, nil |
| 160 | +} |
| 161 | + |
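| | +// getPendingPodsInNode collects the pending pods on this node, reading them from the kubelet when queryKubelet is true and from the API server otherwise. |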
| 162 | +func getPendingPodsInNode(queryKubelet bool, kubeletClient *client.KubeletClient) ([]v1.Pod, error) { |
| 163 | + // pods, err := m.lister.List(labels.Everything()) |
| 164 | + // if err != nil { |
| 165 | + // return nil, err |
| 166 | + // } |
| 167 | + pods := []v1.Pod{} |
| 168 | + |
| 169 | + podIDMap := map[types.UID]bool{} |
| 170 | + |
| 171 | + var podList *v1.PodList |
| 172 | + var err error |
| 173 | + if queryKubelet { |
| 174 | + podList, err = getPodListsByQueryKubelet(kubeletClient) |
| 175 | + if err != nil { |
| 176 | + return nil, err |
| 177 | + } |
| 178 | + } else { |
| 179 | + podList, err = getPodListsByListAPIServer() |
| 180 | + if err != nil { |
| 181 | + return nil, err |
| 182 | + } |
| 183 | + } |
| 184 | + |
127 | 185 | log.V(5).Infof("all pod list %v", podList.Items) |
128 | 186 |
129 | 187 | // if log.V(5) { |
@@ -154,9 +212,9 @@ func getPendingPodsInNode() ([]v1.Pod, error) { |
154 | 212 | } |
155 | 213 |
156 | 214 | // pick up the gpushare pods whose assigned status is false |
157 | | -func getCandidatePods() ([]*v1.Pod, error) { |
| 215 | +func getCandidatePods(queryKubelet bool, client *client.KubeletClient) ([]*v1.Pod, error) { |
158 | 216 | candidatePods := []*v1.Pod{} |
159 | | - allPods, err := getPendingPodsInNode() |
| 217 | + allPods, err := getPendingPodsInNode(queryKubelet, client) |
160 | 218 | if err != nil { |
161 | 219 | return candidatePods, err |
162 | 220 | } |