Skip to content

Commit e33390a

Browse files
Fixing pcidevice device plugin stop deadlock
Signed-off-by: Webber Huang <webber.huang@suse.com>
1 parent 82535bf commit e33390a

File tree

3 files changed

+15
-9
lines changed

3 files changed

+15
-9
lines changed

pkg/controller/nodes/node_controller.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ import (
77
"reflect"
88
"time"
99

10-
"github.com/harvester/pcidevices/pkg/controller/gpudevice"
11-
1210
ctlnetworkv1beta1 "github.com/harvester/harvester-network-controller/pkg/generated/controllers/network.harvesterhci.io/v1beta1"
1311
"github.com/jaypipes/ghw"
1412
ctlcorev1 "github.com/rancher/wrangler/pkg/generated/controllers/core/v1"
@@ -17,6 +15,7 @@ import (
1715
"k8s.io/apimachinery/pkg/labels"
1816

1917
"github.com/harvester/pcidevices/pkg/apis/devices.harvesterhci.io/v1beta1"
18+
"github.com/harvester/pcidevices/pkg/controller/gpudevice"
2019
"github.com/harvester/pcidevices/pkg/controller/pcidevice"
2120
"github.com/harvester/pcidevices/pkg/controller/sriovdevice"
2221
ctl "github.com/harvester/pcidevices/pkg/generated/controllers/devices.harvesterhci.io/v1beta1"

pkg/deviceplugins/device_manager.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ const (
4545
pciBasePath = "/sys/bus/pci/devices"
4646
connectionTimeout = 120 * time.Second // Google gRPC default timeout
4747
PCIResourcePrefix = "PCI_RESOURCE"
48+
tickerTimeout = 30 * time.Second
4849
)
4950

5051
type PCIDevice struct {
@@ -293,7 +294,7 @@ func (dp *PCIDevicePlugin) Allocate(_ context.Context, r *pluginapi.AllocateRequ
293294

294295
func (dp *PCIDevicePlugin) healthCheck() error {
295296
logger := log.DefaultLogger()
296-
monitoredDevices := make(map[string]string)
297+
monitoredDevices := map[string]string{}
297298
watcher, err := fsnotify.NewWatcher()
298299
if err != nil {
299300
return fmt.Errorf("failed to creating a fsnotify watcher: %v", err)
@@ -352,6 +353,8 @@ func (dp *PCIDevicePlugin) healthCheck() error {
352353
select {
353354
case <-dp.stop:
354355
return nil
356+
case <-dp.done:
357+
return nil
355358
case err := <-watcher.Errors:
356359
logger.Reason(err).Errorf("error watching devices and device plugin directory")
357360
case event := <-watcher.Events:
@@ -389,14 +392,12 @@ func (dp *PCIDevicePlugin) GetDeviceName() string {
389392

390393
// stopDevicePlugin stops the gRPC server
391394
func (dp *PCIDevicePlugin) stopDevicePlugin() error {
392-
defer func() {
393-
if !IsChanClosed(dp.done) {
394-
close(dp.done)
395-
}
396-
}()
395+
if !IsChanClosed(dp.done) {
396+
close(dp.done)
397+
}
397398

398399
// Give the device plugin up to tickerTimeout (30 seconds) to properly deregister
399-
ticker := time.NewTicker(1 * time.Second)
400+
ticker := time.NewTicker(tickerTimeout)
400401
defer ticker.Stop()
401402
select {
402403
case <-dp.deregistered:

pkg/deviceplugins/deviceplugin.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,5 +92,11 @@ func (dp *PCIDevicePlugin) RemoveDevice(pd *v1beta1.PCIDevice, pdc *v1beta1.PCID
9292
logrus.Infof("Removing %s from device plugin", resourceName)
9393
dp.MarkPCIDeviceAsUnhealthy(pdc.Spec.Address)
9494
}
95+
96+
for i, dev := range dp.devs {
97+
if dev.ID == pdc.Spec.Address {
98+
dp.devs[i].Health = pluginapi.Unhealthy
99+
}
100+
}
95101
return nil
96102
}

0 commit comments

Comments
 (0)