@@ -820,8 +820,8 @@ type ncStateReconciler interface {
820820}
821821
822822// TODO(rbtr) where should this live??
823- // InitCNS initializes cns by passing pods and a createnetworkcontainerrequest
824- func initCNS (ctx context.Context , cli nodeNetworkConfigGetter , ncReconciler ncStateReconciler , podInfoByIPProvider cns.PodInfoByIPProvider ) error {
823+ // reconcileInitialCNSState initializes cns by passing pods and a CreateNetworkContainerRequest
824+ func reconcileInitialCNSState (ctx context.Context , cli nodeNetworkConfigGetter , ncReconciler ncStateReconciler , podInfoByIPProvider cns.PodInfoByIPProvider ) error {
825825 // Get nnc using direct client
826826 nnc , err := cli .Get (ctx )
827827 if err != nil {
@@ -864,8 +864,6 @@ func initCNS(ctx context.Context, cli nodeNetworkConfigGetter, ncReconciler ncSt
864864
865865// InitializeCRDState builds and starts the CRD controllers.
866866func InitializeCRDState (ctx context.Context , httpRestService cns.HTTPService , cnsconfig * configuration.CNSConfig ) error {
867- logger .Printf ("[Azure CNS] Starting request controller" )
868-
869867 // convert interface type to implementation type
870868 httpRestServiceImplementation , ok := httpRestService .(* restserver.HTTPRestService )
871869 if ! ok {
@@ -880,40 +878,23 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
880878 }
881879 httpRestServiceImplementation .SetNodeOrchestrator (& orchestrator )
882880
881+ // build default clientset.
883882 kubeConfig , err := ctrl .GetConfig ()
884883 kubeConfig .UserAgent = fmt .Sprintf ("azure-cns-%s" , version )
885884 if err != nil {
886885 logger .Errorf ("[Azure CNS] Failed to get kubeconfig for request controller: %v" , err )
887886 return err
888887 }
889- nnccli , err := nodenetworkconfig . NewClient (kubeConfig )
888+ clientset , err := kubernetes . NewForConfig (kubeConfig )
890889 if err != nil {
891- return errors .Wrap (err , "failed to create NNC client " )
890+ return errors .Wrap (err , "failed to build clientset " )
892891 }
892+
893+ // get nodename for scoping kube requests to node.
893894 nodeName , err := configuration .NodeName ()
894895 if err != nil {
895896 return errors .Wrap (err , "failed to get NodeName" )
896897 }
897- // TODO(rbtr): nodename and namespace should be in the cns config
898- scopedcli := kubecontroller .NewScopedClient (nnccli , types.NamespacedName {Namespace : "kube-system" , Name : nodeName })
899-
900- // initialize the ipam pool monitor
901- poolOpts := ipampool.Options {
902- RefreshDelay : poolIPAMRefreshRateInMilliseconds * time .Millisecond ,
903- }
904- poolMonitor := ipampool .NewMonitor (httpRestServiceImplementation , scopedcli , & poolOpts )
905- httpRestServiceImplementation .IPAMPoolMonitor = poolMonitor
906- logger .Printf ("Starting IPAM Pool Monitor" )
907- go func () {
908- if e := poolMonitor .Start (ctx ); e != nil {
909- logger .Errorf ("[Azure CNS] Failed to start pool monitor with err: %v" , e )
910- }
911- }()
912-
913- clientset , err := kubernetes .NewForConfig (kubeConfig )
914- if err != nil {
915- return errors .Wrap (err , "failed to build clientset" )
916- }
917898
918899 var podInfoByIPProvider cns.PodInfoByIPProvider
919900 if cnsconfig .InitializeFromCNI {
@@ -939,19 +920,49 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
939920 })
940921 }
941922
923+ // create scoped kube clients.
924+ nnccli , err := nodenetworkconfig .NewClient (kubeConfig )
925+ if err != nil {
926+ return errors .Wrap (err , "failed to create NNC client" )
927+ }
928+ // TODO(rbtr): nodename and namespace should be in the cns config
929+ scopedcli := kubecontroller .NewScopedClient (nnccli , types.NamespacedName {Namespace : "kube-system" , Name : nodeName })
930+
931+ // initialize the ipam pool monitor
932+ poolOpts := ipampool.Options {
933+ RefreshDelay : poolIPAMRefreshRateInMilliseconds * time .Millisecond ,
934+ }
935+ poolMonitor := ipampool .NewMonitor (httpRestServiceImplementation , scopedcli , & poolOpts )
936+ httpRestServiceImplementation .IPAMPoolMonitor = poolMonitor
937+
938+ // reconcile initial CNS state from CNI or apiserver.
942939 // apiserver nnc might not be registered or api server might be down and crashloop backoff puts us outside of 5-10 minutes we have for
943940 // aks addons to come up so retry a bit more aggressively here.
944941 // will retry 10 times maxing out at a minute taking about 8 minutes before it gives up.
942+ attempt := 0
945943 err = retry .Do (func () error {
946- err = initCNS (ctx , scopedcli , httpRestServiceImplementation , podInfoByIPProvider )
944+ attempt ++
945+ logger .Printf ("reconciling initial CNS state attempt: %d" , attempt )
946+ err = reconcileInitialCNSState (ctx , scopedcli , httpRestServiceImplementation , podInfoByIPProvider )
947947 if err != nil {
948- logger .Errorf ("[Azure CNS] Failed to init cns with err: %v" , err )
948+ logger .Errorf ("failed to reconcile initial CNS state, attempt: %d err: %v" , attempt , err )
949949 }
950950 return errors .Wrap (err , "failed to initialize CNS state" )
951951 }, retry .Context (ctx ), retry .Delay (initCNSInitalDelay ), retry .MaxDelay (time .Minute ))
952952 if err != nil {
953953 return err
954954 }
955+ logger .Printf ("reconciled initial CNS state after %d attempts" , attempt )
956+
957+ // start the pool Monitor before the Reconciler, since it needs to be ready to receive a
958+ // NodeNetworkConfig update by the time the Reconciler tries to send it.
959+ go func () {
960+ logger .Printf ("Starting IPAM Pool Monitor" )
961+ if e := poolMonitor .Start (ctx ); e != nil {
962+ logger .Errorf ("[Azure CNS] Failed to start pool monitor with err: %v" , e )
963+ }
964+ }()
965+ logger .Printf ("initialized and started IPAM pool monitor" )
955966
956967 // the nodeScopedCache sets Selector options on the Manager cache which are used
957968 // to perform *server-side* filtering of the cached objects. This is very important
@@ -982,31 +993,42 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
982993 return errors .Wrapf (err , "failed to get node %s" , nodeName )
983994 }
984995
985- reconciler := kubecontroller .NewReconciler (nnccli , httpRestServiceImplementation , httpRestServiceImplementation . IPAMPoolMonitor )
996+ reconciler := kubecontroller .NewReconciler (nnccli , httpRestServiceImplementation , poolMonitor )
986997 // pass Node to the Reconciler for Controller xref
987998 if err := reconciler .SetupWithManager (manager , node ); err != nil {
988999 return errors .Wrapf (err , "failed to setup reconciler with manager" )
9891000 }
9901001
991- // Start the RequestController which starts the reconcile loop
1002+ // Start the Manager which starts the reconcile loop.
1003+ // The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the
1004+ // Monitor's internal loop.
9921005 go func () {
1006+ logger .Printf ("Starting NodeNetworkConfig reconciler." )
9931007 for {
9941008 if err := manager .Start (ctx ); err != nil {
9951009 logger .Errorf ("[Azure CNS] Failed to start request controller: %v" , err )
9961010 // retry to start the request controller
9971011 // todo: add a CNS metric to count # of failures
9981012 } else {
999- logger .Printf ("[Azure CNS] Exiting RequestController " )
1013+ logger .Printf ("exiting NodeNetworkConfig reconciler " )
10001014 return
10011015 }
10021016
10031017 // Retry after 1sec
10041018 time .Sleep (time .Second )
10051019 }
10061020 }()
1021+ logger .Printf ("initialized NodeNetworkConfig reconciler" )
1022+ // wait for up to 10m for the Reconciler to run once.
1023+ timedCtx , cancel := context .WithTimeout (ctx , 10 * time .Minute ) //nolint:gomnd // default 10m
1024+ defer cancel ()
1025+ if started := reconciler .Started (timedCtx ); ! started {
1026+ return errors .Errorf ("timed out waiting for reconciler start" )
1027+ }
1028+ logger .Printf ("started NodeNetworkConfig reconciler" )
10071029
1008- logger .Printf ("Starting SyncHostNCVersion" )
10091030 go func () {
1031+ logger .Printf ("starting SyncHostNCVersion loop" )
10101032 // Periodically poll vfp programmed NC version from NMAgent
10111033 tickerChannel := time .Tick (time .Duration (cnsconfig .SyncHostNCVersionIntervalMs ) * time .Millisecond )
10121034 for {
@@ -1016,10 +1038,12 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
10161038 httpRestServiceImplementation .SyncHostNCVersion (timedCtx , cnsconfig .ChannelMode )
10171039 cancel ()
10181040 case <- ctx .Done ():
1041+ logger .Printf ("exiting SyncHostNCVersion" )
10191042 return
10201043 }
10211044 }
10221045 }()
1046+ logger .Printf ("initialized and started SyncHostNCVersion loop" )
10231047
10241048 return nil
10251049}
0 commit comments