Skip to content

Commit a42eb58

Browse files
Fixing pcidevice device plugin stop deadlock
Signed-off-by: Webber Huang <webber.huang@suse.com>
Fixing CodeFactor "Complex Method" in pcidevice plugin healthCheck()
1 parent 82535bf commit a42eb58

File tree

3 files changed

+23
-14
lines changed

3 files changed

+23
-14
lines changed

pkg/controller/nodes/node_controller.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ import (
77
"reflect"
88
"time"
99

10-
"github.com/harvester/pcidevices/pkg/controller/gpudevice"
11-
1210
ctlnetworkv1beta1 "github.com/harvester/harvester-network-controller/pkg/generated/controllers/network.harvesterhci.io/v1beta1"
1311
"github.com/jaypipes/ghw"
1412
ctlcorev1 "github.com/rancher/wrangler/pkg/generated/controllers/core/v1"
@@ -17,6 +15,7 @@ import (
1715
"k8s.io/apimachinery/pkg/labels"
1816

1917
"github.com/harvester/pcidevices/pkg/apis/devices.harvesterhci.io/v1beta1"
18+
"github.com/harvester/pcidevices/pkg/controller/gpudevice"
2019
"github.com/harvester/pcidevices/pkg/controller/pcidevice"
2120
"github.com/harvester/pcidevices/pkg/controller/sriovdevice"
2221
ctl "github.com/harvester/pcidevices/pkg/generated/controllers/devices.harvesterhci.io/v1beta1"

pkg/deviceplugins/device_manager.go

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ const (
4545
pciBasePath = "/sys/bus/pci/devices"
4646
connectionTimeout = 120 * time.Second // Google gRPC default timeout
4747
PCIResourcePrefix = "PCI_RESOURCE"
48+
tickerTimeout = 30 * time.Second
4849
)
4950

5051
type PCIDevice struct {
@@ -292,7 +293,6 @@ func (dp *PCIDevicePlugin) Allocate(_ context.Context, r *pluginapi.AllocateRequ
292293
}
293294

294295
func (dp *PCIDevicePlugin) healthCheck() error {
295-
logger := log.DefaultLogger()
296296
monitoredDevices := make(map[string]string)
297297
watcher, err := fsnotify.NewWatcher()
298298
if err != nil {
@@ -348,31 +348,37 @@ func (dp *PCIDevicePlugin) healthCheck() error {
348348
return fmt.Errorf("failed to watch device-plugin socket: %v", err)
349349
}
350350

351+
return dp.performCheck(monitoredDevices, watcher)
352+
}
353+
354+
func (dp *PCIDevicePlugin) performCheck(monitoredDevices map[string]string, watcher *fsnotify.Watcher) error {
351355
for {
352356
select {
353357
case <-dp.stop:
354358
return nil
359+
case <-dp.done:
360+
return nil
355361
case err := <-watcher.Errors:
356-
logger.Reason(err).Errorf("error watching devices and device plugin directory")
362+
logrus.Errorf("error watching devices and device plugin directory: %v", err)
357363
case event := <-watcher.Events:
358-
logger.V(4).Infof("health Event: %v", event)
364+
logrus.Infof("health Event: %v", event)
359365
if monDevID, exist := monitoredDevices[event.Name]; exist {
360366
// Health in this case is if the device path actually exists
361367
if event.Op == fsnotify.Create {
362-
logger.Infof("monitored device %s appeared", dp.resourceName)
368+
logrus.Infof("monitored device %s appeared", dp.resourceName)
363369
dp.health <- deviceHealth{
364370
DevID: monDevID,
365371
Health: pluginapi.Healthy,
366372
}
367373
} else if (event.Op == fsnotify.Remove) || (event.Op == fsnotify.Rename) {
368-
logger.Infof("monitored device %s disappeared", dp.resourceName)
374+
logrus.Infof("monitored device %s disappeared", dp.resourceName)
369375
dp.health <- deviceHealth{
370376
DevID: monDevID,
371377
Health: pluginapi.Unhealthy,
372378
}
373379
}
374380
} else if event.Name == dp.socketPath && event.Op == fsnotify.Remove {
375-
logger.Infof("device socket file for device %s was removed, kubelet probably restarted.", dp.resourceName)
381+
logrus.Infof("device socket file for device %s was removed, kubelet probably restarted.", dp.resourceName)
376382
return nil
377383
}
378384
}
@@ -389,14 +395,12 @@ func (dp *PCIDevicePlugin) GetDeviceName() string {
389395

390396
// stopDevicePlugin stops the gRPC server
391397
func (dp *PCIDevicePlugin) stopDevicePlugin() error {
392-
defer func() {
393-
if !IsChanClosed(dp.done) {
394-
close(dp.done)
395-
}
396-
}()
398+
if !IsChanClosed(dp.done) {
399+
close(dp.done)
400+
}
397401

398402
// Give the device plugin up to tickerTimeout (30 seconds) to properly deregister
399-
ticker := time.NewTicker(1 * time.Second)
403+
ticker := time.NewTicker(tickerTimeout)
400404
defer ticker.Stop()
401405
select {
402406
case <-dp.deregistered:

pkg/deviceplugins/deviceplugin.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,5 +92,11 @@ func (dp *PCIDevicePlugin) RemoveDevice(pd *v1beta1.PCIDevice, pdc *v1beta1.PCID
9292
logrus.Infof("Removing %s from device plugin", resourceName)
9393
dp.MarkPCIDeviceAsUnhealthy(pdc.Spec.Address)
9494
}
95+
96+
for i, dev := range dp.devs {
97+
if dev.ID == pdc.Spec.Address {
98+
dp.devs[i].Health = pluginapi.Unhealthy
99+
}
100+
}
95101
return nil
96102
}

0 commit comments

Comments
 (0)