@@ -342,7 +342,7 @@ public void start() {
342342 logger .info ("Attempted to connect to the server, but received an unexpected exception, trying again..." , e );
343343 }
344344 }
345- shell .updateConnectedHost ();
345+ shell .updateConnectedHost ((( NioClient ) connection ). getHost () );
346346 scavengeOldAgentObjects ();
347347 }
348348
@@ -617,15 +617,11 @@ public Task create(final Task.Type type, final Link link, final byte[] data) {
617617 }
618618
619619 protected void reconnect (final Link link ) {
620- reconnect (link , null , null , false );
620+ reconnect (link , null , false );
621621 }
622622
623- protected void reconnect (final Link link , String preferredHost , List < String > avoidHostList , boolean forTransfer ) {
623+ protected void reconnect (final Link link , String preferredMSHost , boolean forTransfer ) {
624624 if (!(forTransfer || reconnectAllowed )) {
625- return ;
626- }
627-
628- if (!reconnectAllowed ) {
629625 logger .debug ("Reconnect requested but it is not allowed {}" , () -> getLinkLog (link ));
630626 return ;
631627 }
@@ -637,19 +633,26 @@ protected void reconnect(final Link link, String preferredHost, List<String> avo
637633 serverResource .disconnected ();
638634 logger .info ("Lost connection to host: {}. Attempting reconnection while we still have {} commands in progress." , shell .getConnectedHost (), commandsInProgress .get ());
639635 stopAndCleanupConnection (true );
636+ String host = preferredMSHost ;
637+ if (org .apache .commons .lang3 .StringUtils .isBlank (host )) {
638+ host = shell .getNextHost ();
639+ }
640+ List <String > avoidMSHostList = shell .getAvoidHosts ();
640641 do {
641- final String host = shell .getNextHost ();
642- connection = new NioClient (getAgentName (), host , shell .getPort (), shell .getWorkers (), shell .getSslHandshakeTimeout (), this );
643- logger .info ("Reconnecting to host: {}" , host );
644- try {
645- connection .start ();
646- } catch (final NioConnectionException e ) {
647- logger .info ("Attempted to re-connect to the server, but received an unexpected exception, trying again..." , e );
648- stopAndCleanupConnection (false );
642+ if (CollectionUtils .isEmpty (avoidMSHostList ) || !avoidMSHostList .contains (host )) {
643+ connection = new NioClient (getAgentName (), host , shell .getPort (), shell .getWorkers (), shell .getSslHandshakeTimeout (), this );
644+ logger .info ("Reconnecting to host: {}" , host );
645+ try {
646+ connection .start ();
647+ } catch (final NioConnectionException e ) {
648+ logger .info ("Attempted to re-connect to the server, but received an unexpected exception, trying again..." , e );
649+ stopAndCleanupConnection (false );
650+ }
649651 }
650652 shell .getBackoffAlgorithm ().waitBeforeRetry ();
653+ host = shell .getNextHost ();
651654 } while (!connection .isStartup ());
652- shell .updateConnectedHost ();
655+ shell .updateConnectedHost ((( NioClient ) connection ). getHost () );
653656 logger .info ("Connected to the host: {}" , shell .getConnectedHost ());
654657 }
655658
@@ -922,7 +925,7 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) {
922925 return new SetupCertificateAnswer (true );
923926 }
924927
925- private void processManagementServerList (final List <String > msList , final String lbAlgorithm , final Long lbCheckInterval ) {
928+ private void processManagementServerList (final List <String > msList , final List < String > avoidMsList , final String lbAlgorithm , final Long lbCheckInterval ) {
926929 if (CollectionUtils .isNotEmpty (msList ) && StringUtils .isNotEmpty (lbAlgorithm )) {
927930 try {
928931 final String newMSHosts = String .format ("%s%s%s" , com .cloud .utils .StringUtils .toCSVList (msList ), IAgentShell .hostLbAlgorithmSeparator , lbAlgorithm );
@@ -934,6 +937,7 @@ private void processManagementServerList(final List<String> msList, final String
934937 throw new CloudRuntimeException ("Could not persist received management servers list" , e );
935938 }
936939 }
940+ shell .setAvoidHosts (avoidMsList );
937941 if ("shuffle" .equals (lbAlgorithm )) {
938942 scheduleHostLBCheckerTask (0 );
939943 } else {
@@ -942,16 +946,18 @@ private void processManagementServerList(final List<String> msList, final String
942946 }
943947
944948 private Answer setupManagementServerList (final SetupMSListCommand cmd ) {
945- processManagementServerList (cmd .getMsList (), cmd .getLbAlgorithm (), cmd .getLbCheckInterval ());
949+ processManagementServerList (cmd .getMsList (), cmd .getAvoidMsList (), cmd . getLbAlgorithm (), cmd .getLbCheckInterval ());
946950 return new SetupMSListAnswer (true );
947951 }
948952
949953 private Answer migrateAgentToOtherMS (final MigrateAgentConnectionCommand cmd ) {
950954 try {
951955 if (CollectionUtils .isNotEmpty (cmd .getMsList ())) {
952- processManagementServerList (cmd .getMsList (), cmd .getLbAlgorithm (), cmd .getLbCheckInterval ());
956+ processManagementServerList (cmd .getMsList (), cmd .getAvoidMsList (), cmd . getLbAlgorithm (), cmd .getLbCheckInterval ());
953957 }
954- migrateAgentConnection (cmd .getAvoidMsList ());
958+ Executors .newSingleThreadScheduledExecutor (new NamedThreadFactory ("MigrateAgentConnection-Job" )).schedule (() -> {
959+ migrateAgentConnection (cmd .getAvoidMsList ());
960+ }, 3 , TimeUnit .SECONDS );
955961 } catch (Exception e ) {
956962 String errMsg = "Migrate agent connection failed, due to " + e .getMessage ();
957963 logger .debug (errMsg , e );
@@ -972,25 +978,26 @@ private void migrateAgentConnection(List<String> avoidMsList) {
972978 throw new CloudRuntimeException ("No other Management Server hosts to migrate" );
973979 }
974980
975- String preferredHost = null ;
981+ String preferredMSHost = null ;
976982 for (String msHost : msHostsList ) {
977983 try (final Socket socket = new Socket ()) {
978984 socket .connect (new InetSocketAddress (msHost , shell .getPort ()), 5000 );
979- preferredHost = msHost ;
985+ preferredMSHost = msHost ;
980986 break ;
981987 } catch (final IOException e ) {
982988 throw new CloudRuntimeException ("Management server host: " + msHost + " is not reachable, to migrate connection" );
983989 }
984990 }
985991
986- if (preferredHost == null ) {
992+ if (preferredMSHost == null ) {
987993 throw new CloudRuntimeException ("Management server host(s) are not reachable, to migrate connection" );
988994 }
989995
990- logger .debug ("Management server host " + preferredHost + " is found to be reachable, trying to reconnect" );
996+ logger .debug ("Management server host " + preferredMSHost + " is found to be reachable, trying to reconnect" );
991997 shell .resetHostCounter ();
998+ shell .setAvoidHosts (avoidMsList );
992999 shell .setConnectionTransfer (true );
993- reconnect (link , preferredHost , avoidMsList , true );
1000+ reconnect (link , preferredMSHost , true );
9941001 }
9951002
9961003 public void processResponse (final Response response , final Link link ) {
@@ -1003,14 +1010,21 @@ public void processResponse(final Response response, final Link link) {
10031010 for (final IAgentControlListener listener : controlListeners ) {
10041011 listener .processControlResponse (response , (AgentControlAnswer )answer );
10051012 }
1006- } else if (answer instanceof PingAnswer && (((PingAnswer ) answer ).isSendStartup ()) && reconnectAllowed ) {
1007- logger .info ("Management server requested startup command to reinitialize the agent" );
1008- sendStartup (link );
1013+ } else if (answer instanceof PingAnswer ) {
1014+ processPingAnswer ((PingAnswer ) answer );
10091015 } else {
10101016 updateLastPingResponseTime ();
10111017 }
10121018 }
10131019
1020+ private void processPingAnswer (final PingAnswer answer ) {
1021+ if ((answer .isSendStartup ()) && reconnectAllowed ) {
1022+ logger .info ("Management server requested startup command to reinitialize the agent" );
1023+ sendStartup (link );
1024+ }
1025+ shell .setAvoidHosts (answer .getAvoidMsList ());
1026+ }
1027+
10141028 public void processReadyCommand (final Command cmd ) {
10151029 final ReadyCommand ready = (ReadyCommand )cmd ;
10161030 // Set human readable sizes;
@@ -1027,7 +1041,7 @@ public void processReadyCommand(final Command cmd) {
10271041 }
10281042
10291043 verifyAgentArch (ready .getArch ());
1030- processManagementServerList (ready .getMsHostList (), ready .getLbAlgorithm (), ready .getLbCheckInterval ());
1044+ processManagementServerList (ready .getMsHostList (), ready .getAvoidMsHostList (), ready . getLbAlgorithm (), ready .getLbCheckInterval ());
10311045
10321046 logger .info ("Ready command is processed for agent [id: {}, uuid: {}, name: {}]" , getId (), getUuid (), getName ());
10331047 }
@@ -1374,26 +1388,26 @@ protected void runInContext() {
13741388 if (msList == null || msList .length < 1 ) {
13751389 return ;
13761390 }
1377- final String preferredHost = msList [0 ];
1391+ final String preferredMSHost = msList [0 ];
13781392 final String connectedHost = shell .getConnectedHost ();
13791393 logger .debug ("Running preferred host checker task, connected host={}, preferred host={}" ,
1380- connectedHost , preferredHost );
1381- if (preferredHost == null || preferredHost .equals (connectedHost ) || link == null ) {
1394+ connectedHost , preferredMSHost );
1395+ if (preferredMSHost == null || preferredMSHost .equals (connectedHost ) || link == null ) {
13821396 return ;
13831397 }
13841398 boolean isHostUp = false ;
13851399 try (final Socket socket = new Socket ()) {
1386- socket .connect (new InetSocketAddress (preferredHost , shell .getPort ()), 5000 );
1400+ socket .connect (new InetSocketAddress (preferredMSHost , shell .getPort ()), 5000 );
13871401 isHostUp = true ;
13881402 } catch (final IOException e ) {
1389- logger .debug ("Host: {} is not reachable" , preferredHost );
1403+ logger .debug ("Host: {} is not reachable" , preferredMSHost );
13901404 }
13911405 if (isHostUp && link != null && commandsInProgress .get () == 0 ) {
13921406 if (logger .isDebugEnabled ()) {
1393- logger .debug ("Preferred host {} is found to be reachable, trying to reconnect" , preferredHost );
1407+ logger .debug ("Preferred host {} is found to be reachable, trying to reconnect" , preferredMSHost );
13941408 }
13951409 shell .resetHostCounter ();
1396- reconnect (link );
1410+ reconnect (link , preferredMSHost , false );
13971411 }
13981412 } catch (Throwable t ) {
13991413 logger .error ("Error caught while attempting to connect to preferred host" , t );
0 commit comments