@@ -26,6 +26,7 @@ import (
2626 "github.com/Azure/azure-container-networking/cns/cnireconciler"
2727 "github.com/Azure/azure-container-networking/cns/common"
2828 "github.com/Azure/azure-container-networking/cns/configuration"
29+ "github.com/Azure/azure-container-networking/cns/deviceplugin"
2930 "github.com/Azure/azure-container-networking/cns/endpointmanager"
3031 "github.com/Azure/azure-container-networking/cns/fsnotify"
3132 "github.com/Azure/azure-container-networking/cns/grpc"
@@ -65,6 +66,7 @@ import (
6566 "github.com/Azure/azure-container-networking/store"
6667 "github.com/Azure/azure-container-networking/telemetry"
6768 "github.com/avast/retry-go/v4"
69+ "github.com/google/go-cmp/cmp"
6870 "github.com/pkg/errors"
6971 "go.uber.org/zap"
7072 "go.uber.org/zap/zapcore"
@@ -105,9 +107,14 @@ const (
105107 // envVarEnableCNIConflistGeneration enables cni conflist generation if set (value doesn't matter)
106108 envVarEnableCNIConflistGeneration = "CNS_ENABLE_CNI_CONFLIST_GENERATION"
107109
108- cnsReqTimeout = 15 * time .Second
109- defaultLocalServerIP = "localhost"
110- defaultLocalServerPort = "10090"
110+ cnsReqTimeout = 15 * time .Second
111+ defaultLocalServerIP = "localhost"
112+ defaultLocalServerPort = "10090"
113+ defaultDevicePluginRetryInterval = 2 * time .Second
114+ defaultNodeInfoCRDPollInterval = 5 * time .Second
115+ defaultDevicePluginMaxRetryCount = 5
116+ initialVnetNICCount = 0
117+ initialIBNICCount = 0
111118)
112119
113120type cniConflistScenario string
@@ -910,6 +917,50 @@ func main() {
910917 }
911918 }
912919
920+ if cnsconfig .EnableSwiftV2 && cnsconfig .EnableK8sDevicePlugin {
921+ // Create device plugin manager instance
922+ pluginManager := deviceplugin .NewPluginManager (z )
923+ pluginManager .AddPlugin (mtv1alpha1 .DeviceTypeVnetNIC , initialVnetNICCount )
924+ pluginManager .AddPlugin (mtv1alpha1 .DeviceTypeInfiniBandNIC , initialIBNICCount )
925+
926+ ctx , cancel := context .WithCancel (context .Background ())
927+ defer cancel ()
928+
929+ // Start device plugin manager in a separate goroutine
930+ go func () {
931+ retryCount := 0
932+ ticker := time .NewTicker (defaultDevicePluginRetryInterval )
933+ // Ensure the ticker is stopped on exit
934+ defer ticker .Stop ()
935+ for {
936+ select {
937+ case <- ctx .Done ():
938+ z .Info ("Context canceled, stopping plugin manager" )
939+ return
940+ case <- ticker .C :
941+ if pluginErr := pluginManager .Run (ctx ); pluginErr != nil {
942+ z .Error ("plugin manager exited with error" , zap .Error (pluginErr ))
943+ retryCount ++
944+ // Implementing a basic circuit breaker
945+ if retryCount >= defaultDevicePluginMaxRetryCount {
946+ z .Error ("Max retries reached, stopping plugin manager" )
947+ return
948+ }
949+ } else {
950+ return
951+ }
952+ }
953+ }
954+ }()
955+
956+ // go routine to poll node info crd and update device counts
957+ go func () {
958+ if pollErr := pollNodeInfoCRDAndUpdatePlugin (ctx , z , pluginManager ); pollErr != nil {
959+ z .Error ("Error in pollNodeInfoCRDAndUpdatePlugin" , zap .Error (pollErr ))
960+ }
961+ }()
962+ }
963+
913964 // Conditionally initialize and start the gRPC server
914965 if cnsconfig .GRPCSettings .Enable {
915966 // Define gRPC server settings
@@ -1083,6 +1134,91 @@ func main() {
10831134 logger .Close ()
10841135}
10851136
1137+ // Poll CRD until it's set and update PluginManager
1138+ func pollNodeInfoCRDAndUpdatePlugin (ctx context.Context , zlog * zap.Logger , pluginManager * deviceplugin.PluginManager ) error {
1139+ kubeConfig , err := ctrl .GetConfig ()
1140+ if err != nil {
1141+ logger .Errorf ("Failed to get kubeconfig for request controller: %v" , err )
1142+ return errors .Wrap (err , "failed to get kubeconfig" )
1143+ }
1144+ kubeConfig .UserAgent = "azure-cns-" + version
1145+
1146+ clientset , err := kubernetes .NewForConfig (kubeConfig )
1147+ if err != nil {
1148+ return errors .Wrap (err , "failed to build clientset" )
1149+ }
1150+
1151+ nodeName , err := configuration .NodeName ()
1152+ if err != nil {
1153+ return errors .Wrap (err , "failed to get NodeName" )
1154+ }
1155+
1156+ node , err := clientset .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
1157+ if err != nil {
1158+ return errors .Wrapf (err , "failed to get node %s" , nodeName )
1159+ }
1160+
1161+ // check the Node labels for Swift V2
1162+ if _ , ok := node .Labels [configuration .LabelNodeSwiftV2 ]; ! ok {
1163+ zlog .Info ("Node is not labeled for Swift V2, skipping polling nodeinfo crd" )
1164+ return nil
1165+ }
1166+
1167+ directcli , err := client .New (kubeConfig , client.Options {Scheme : multitenancy .Scheme })
1168+ if err != nil {
1169+ return errors .Wrap (err , "failed to create ctrl client" )
1170+ }
1171+
1172+ nodeInfoCli := multitenancy.NodeInfoClient {
1173+ Cli : directcli ,
1174+ }
1175+
1176+ ticker := time .NewTicker (defaultNodeInfoCRDPollInterval )
1177+ defer ticker .Stop ()
1178+
1179+ for {
1180+ select {
1181+ case <- ctx .Done ():
1182+ zlog .Info ("Polling context canceled, exiting" )
1183+ return nil
1184+ case <- ticker .C :
1185+ // Fetch the CRD status
1186+ nodeInfo , err := nodeInfoCli .Get (ctx , node .Name )
1187+ if err != nil {
1188+ zlog .Error ("Error fetching nodeinfo CRD" , zap .Error (err ))
1189+ return errors .Wrap (err , "failed to get nodeinfo crd" )
1190+ }
1191+
1192+ // Check if the status is set
1193+ if ! cmp .Equal (nodeInfo .Status , mtv1alpha1.NodeInfoStatus {}) && len (nodeInfo .Status .DeviceInfos ) > 0 {
1194+ // Create a map to count devices by type
1195+ deviceCounts := map [mtv1alpha1.DeviceType ]int {
1196+ mtv1alpha1 .DeviceTypeVnetNIC : 0 ,
1197+ mtv1alpha1 .DeviceTypeInfiniBandNIC : 0 ,
1198+ }
1199+
1200+ // Aggregate device counts from the CRD
1201+ for _ , deviceInfo := range nodeInfo .Status .DeviceInfos {
1202+ switch deviceInfo .DeviceType {
1203+ case mtv1alpha1 .DeviceTypeVnetNIC , mtv1alpha1 .DeviceTypeInfiniBandNIC :
1204+ deviceCounts [deviceInfo .DeviceType ]++
1205+ default :
1206+ zlog .Error ("Unknown device type" , zap .String ("deviceType" , string (deviceInfo .DeviceType )))
1207+ }
1208+ }
1209+
1210+ // Update the plugin manager with device counts
1211+ for deviceType , count := range deviceCounts {
1212+ pluginManager .TrackDevices (deviceType , count )
1213+ }
1214+
1215+ // Exit polling loop once the CRD status is successfully processed
1216+ return nil
1217+ }
1218+ }
1219+ }
1220+ }
1221+
10861222func InitializeMultiTenantController (ctx context.Context , httpRestService cns.HTTPService , cnsconfig configuration.CNSConfig ) error {
10871223 var multiTenantController multitenantcontroller.RequestController
10881224 kubeConfig , err := ctrl .GetConfig ()
0 commit comments