@@ -342,7 +342,7 @@ public void start() {
342342 logger .info ("Attempted to connect to the server, but received an unexpected exception, trying again..." , e );
343343 }
344344 }
345- shell .updateConnectedHost ();
345+ shell .updateConnectedHost ((( NioClient ) connection ). getHost () );
346346 scavengeOldAgentObjects ();
347347 }
348348
@@ -617,15 +617,11 @@ public Task create(final Task.Type type, final Link link, final byte[] data) {
617617 }
618618
619619 protected void reconnect (final Link link ) {
620- reconnect (link , null , null , false );
620+ reconnect (link , null , false );
621621 }
622622
623- protected void reconnect (final Link link , String preferredHost , List < String > avoidHostList , boolean forTransfer ) {
623+ protected void reconnect (final Link link , String preferredMSHost , boolean forTransfer ) {
624624 if (!(forTransfer || reconnectAllowed )) {
625- return ;
626- }
627-
628- if (!reconnectAllowed ) {
629625 logger .debug ("Reconnect requested but it is not allowed {}" , () -> getLinkLog (link ));
630626 return ;
631627 }
@@ -637,19 +633,26 @@ protected void reconnect(final Link link, String preferredHost, List<String> avo
637633 serverResource .disconnected ();
638634 logger .info ("Lost connection to host: {}. Attempting reconnection while we still have {} commands in progress." , shell .getConnectedHost (), commandsInProgress .get ());
639635 stopAndCleanupConnection (true );
636+ String host = preferredMSHost ;
637+ if (org .apache .commons .lang3 .StringUtils .isBlank (host )) {
638+ host = shell .getNextHost ();
639+ }
640+ List <String > avoidMSHostList = shell .getAvoidHosts ();
640641 do {
641- final String host = shell .getNextHost ();
642- connection = new NioClient (getAgentName (), host , shell .getPort (), shell .getWorkers (), shell .getSslHandshakeTimeout (), this );
643- logger .info ("Reconnecting to host: {}" , host );
644- try {
645- connection .start ();
646- } catch (final NioConnectionException e ) {
647- logger .info ("Attempted to re-connect to the server, but received an unexpected exception, trying again..." , e );
648- stopAndCleanupConnection (false );
642+ if (CollectionUtils .isEmpty (avoidMSHostList ) || !avoidMSHostList .contains (host )) {
643+ connection = new NioClient (getAgentName (), host , shell .getPort (), shell .getWorkers (), shell .getSslHandshakeTimeout (), this );
644+ logger .info ("Reconnecting to host: {}" , host );
645+ try {
646+ connection .start ();
647+ } catch (final NioConnectionException e ) {
648+ logger .info ("Attempted to re-connect to the server, but received an unexpected exception, trying again..." , e );
649+ stopAndCleanupConnection (false );
650+ }
649651 }
650652 shell .getBackoffAlgorithm ().waitBeforeRetry ();
653+ host = shell .getNextHost ();
651654 } while (!connection .isStartup ());
652- shell .updateConnectedHost ();
655+ shell .updateConnectedHost ((( NioClient ) connection ). getHost () );
653656 logger .info ("Connected to the host: {}" , shell .getConnectedHost ());
654657 }
655658
@@ -925,7 +928,7 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) {
925928 return new SetupCertificateAnswer (true );
926929 }
927930
928- private void processManagementServerList (final List <String > msList , final String lbAlgorithm , final Long lbCheckInterval ) {
931+ private void processManagementServerList (final List <String > msList , final List < String > avoidMsList , final String lbAlgorithm , final Long lbCheckInterval ) {
929932 if (CollectionUtils .isNotEmpty (msList ) && StringUtils .isNotEmpty (lbAlgorithm )) {
930933 try {
931934 final String newMSHosts = String .format ("%s%s%s" , com .cloud .utils .StringUtils .toCSVList (msList ), IAgentShell .hostLbAlgorithmSeparator , lbAlgorithm );
@@ -937,6 +940,7 @@ private void processManagementServerList(final List<String> msList, final String
937940 throw new CloudRuntimeException ("Could not persist received management servers list" , e );
938941 }
939942 }
943+ shell .setAvoidHosts (avoidMsList );
940944 if ("shuffle" .equals (lbAlgorithm )) {
941945 scheduleHostLBCheckerTask (0 );
942946 } else {
@@ -945,16 +949,18 @@ private void processManagementServerList(final List<String> msList, final String
945949 }
946950
947951 private Answer setupManagementServerList (final SetupMSListCommand cmd ) {
948- processManagementServerList (cmd .getMsList (), cmd .getLbAlgorithm (), cmd .getLbCheckInterval ());
952+ processManagementServerList (cmd .getMsList (), cmd .getAvoidMsList (), cmd . getLbAlgorithm (), cmd .getLbCheckInterval ());
949953 return new SetupMSListAnswer (true );
950954 }
951955
952956 private Answer migrateAgentToOtherMS (final MigrateAgentConnectionCommand cmd ) {
953957 try {
954958 if (CollectionUtils .isNotEmpty (cmd .getMsList ())) {
955- processManagementServerList (cmd .getMsList (), cmd .getLbAlgorithm (), cmd .getLbCheckInterval ());
959+ processManagementServerList (cmd .getMsList (), cmd .getAvoidMsList (), cmd . getLbAlgorithm (), cmd .getLbCheckInterval ());
956960 }
957- migrateAgentConnection (cmd .getAvoidMsList ());
961+ Executors .newSingleThreadScheduledExecutor (new NamedThreadFactory ("MigrateAgentConnection-Job" )).schedule (() -> {
962+ migrateAgentConnection (cmd .getAvoidMsList ());
963+ }, 3 , TimeUnit .SECONDS );
958964 } catch (Exception e ) {
959965 String errMsg = "Migrate agent connection failed, due to " + e .getMessage ();
960966 logger .debug (errMsg , e );
@@ -975,25 +981,26 @@ private void migrateAgentConnection(List<String> avoidMsList) {
975981 throw new CloudRuntimeException ("No other Management Server hosts to migrate" );
976982 }
977983
978- String preferredHost = null ;
984+ String preferredMSHost = null ;
979985 for (String msHost : msHostsList ) {
980986 try (final Socket socket = new Socket ()) {
981987 socket .connect (new InetSocketAddress (msHost , shell .getPort ()), 5000 );
982- preferredHost = msHost ;
988+ preferredMSHost = msHost ;
983989 break ;
984990 } catch (final IOException e ) {
985991 throw new CloudRuntimeException ("Management server host: " + msHost + " is not reachable, to migrate connection" );
986992 }
987993 }
988994
989- if (preferredHost == null ) {
995+ if (preferredMSHost == null ) {
990996 throw new CloudRuntimeException ("Management server host(s) are not reachable, to migrate connection" );
991997 }
992998
993- logger .debug ("Management server host " + preferredHost + " is found to be reachable, trying to reconnect" );
999+ logger .debug ("Management server host " + preferredMSHost + " is found to be reachable, trying to reconnect" );
9941000 shell .resetHostCounter ();
1001+ shell .setAvoidHosts (avoidMsList );
9951002 shell .setConnectionTransfer (true );
996- reconnect (link , preferredHost , avoidMsList , true );
1003+ reconnect (link , preferredMSHost , true );
9971004 }
9981005
9991006 public void processResponse (final Response response , final Link link ) {
@@ -1007,17 +1014,22 @@ public void processResponse(final Response response, final Link link) {
10071014 listener .processControlResponse (response , (AgentControlAnswer )answer );
10081015 }
10091016 } else if (answer instanceof PingAnswer ) {
1010- if (((PingAnswer ) answer ).isSendStartup () && reconnectAllowed ) {
1011- logger .info ("Management server requested startup command to reinitialize the agent" );
1012- sendStartup (link );
1013- } else {
1014- serverResource .processPingAnswer ((PingAnswer ) answer );
1015- }
1017+ processPingAnswer ((PingAnswer ) answer );
10161018 } else {
10171019 updateLastPingResponseTime ();
10181020 }
10191021 }
10201022
1023+ private void processPingAnswer (final PingAnswer answer ) {
1024+ if ((answer .isSendStartup ()) && reconnectAllowed ) {
1025+ logger .info ("Management server requested startup command to reinitialize the agent" );
1026+ sendStartup (link );
1027+ } else {
1028+ serverResource .processPingAnswer ((PingAnswer ) answer );
1029+ }
1030+ shell .setAvoidHosts (answer .getAvoidMsList ());
1031+ }
1032+
10211033 public void processReadyCommand (final Command cmd ) {
10221034 final ReadyCommand ready = (ReadyCommand )cmd ;
10231035 // Set human readable sizes;
@@ -1034,7 +1046,7 @@ public void processReadyCommand(final Command cmd) {
10341046 }
10351047
10361048 verifyAgentArch (ready .getArch ());
1037- processManagementServerList (ready .getMsHostList (), ready .getLbAlgorithm (), ready .getLbCheckInterval ());
1049+ processManagementServerList (ready .getMsHostList (), ready .getAvoidMsHostList (), ready . getLbAlgorithm (), ready .getLbCheckInterval ());
10381050
10391051 logger .info ("Ready command is processed for agent [id: {}, uuid: {}, name: {}]" , getId (), getUuid (), getName ());
10401052 }
@@ -1384,26 +1396,26 @@ protected void runInContext() {
13841396 if (msList == null || msList .length < 1 ) {
13851397 return ;
13861398 }
1387- final String preferredHost = msList [0 ];
1399+ final String preferredMSHost = msList [0 ];
13881400 final String connectedHost = shell .getConnectedHost ();
13891401 logger .debug ("Running preferred host checker task, connected host={}, preferred host={}" ,
1390- connectedHost , preferredHost );
1391- if (preferredHost == null || preferredHost .equals (connectedHost ) || link == null ) {
1402+ connectedHost , preferredMSHost );
1403+ if (preferredMSHost == null || preferredMSHost .equals (connectedHost ) || link == null ) {
13921404 return ;
13931405 }
13941406 boolean isHostUp = false ;
13951407 try (final Socket socket = new Socket ()) {
1396- socket .connect (new InetSocketAddress (preferredHost , shell .getPort ()), 5000 );
1408+ socket .connect (new InetSocketAddress (preferredMSHost , shell .getPort ()), 5000 );
13971409 isHostUp = true ;
13981410 } catch (final IOException e ) {
1399- logger .debug ("Host: {} is not reachable" , preferredHost );
1411+ logger .debug ("Host: {} is not reachable" , preferredMSHost );
14001412 }
14011413 if (isHostUp && link != null && commandsInProgress .get () == 0 ) {
14021414 if (logger .isDebugEnabled ()) {
1403- logger .debug ("Preferred host {} is found to be reachable, trying to reconnect" , preferredHost );
1415+ logger .debug ("Preferred host {} is found to be reachable, trying to reconnect" , preferredMSHost );
14041416 }
14051417 shell .resetHostCounter ();
1406- reconnect (link );
1418+ reconnect (link , preferredMSHost , false );
14071419 }
14081420 } catch (Throwable t ) {
14091421 logger .error ("Error caught while attempting to connect to preferred host" , t );
0 commit comments