@@ -40,8 +40,25 @@ static bool global_keep_tracking = false;
4040#define PCMK_PROCESS_CHECK_INTERVAL 5
4141
4242static crm_trigger_t * shutdown_trigger = NULL ;
43+ static crm_trigger_t * startup_trigger = NULL ;
4344static const char * pid_file = PCMK_RUN_DIR "/pacemaker.pid" ;
4445
46+ /* state we report when asked via pacemakerd-api status-ping */
47+ static const char * pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_INIT ;
48+ static gboolean running_with_sbd = FALSE; /* local copy */
49+ /* When contacted via pacemakerd-api by a client that has "sbd" in
50+ * its name, we assume it is the sbd daemon wanting to know
51+ * whether pacemakerd shut down gracefully.
52+ * Thus, once everything has been shut down properly, pacemakerd
53+ * waits until it has reported the graceful completion of
54+ * shutdown to sbd. Only when that sbd client closes the
55+ * connection can we assume that the report has arrived
56+ * properly, so that pacemakerd can finally exit.
57+ * The following two variables are used to track that handshake.
58+ */
59+ static unsigned int shutdown_complete_state_reported_to = 0 ;
60+ static gboolean shutdown_complete_state_reported_client_closed = FALSE;
61+
4562typedef struct pcmk_child_s {
4663 pid_t pid ;
4764 long flag ;
@@ -374,21 +391,20 @@ escalate_shutdown(gpointer data)
374391static gboolean
375392pcmk_shutdown_worker (gpointer user_data )
376393{
377- static int phase = 0 ;
394+ static int phase = SIZEOF ( pcmk_children ) ;
378395 static time_t next_log = 0 ;
379- static int max = SIZEOF (pcmk_children );
380396
381397 int lpc = 0 ;
382398
383- if (phase == 0 ) {
399+ if (phase == SIZEOF ( pcmk_children ) ) {
384400 crm_notice ("Shutting down Pacemaker" );
385- phase = max ;
401+ pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTTINGDOWN ;
386402 }
387403
388404 for (; phase > 0 ; phase -- ) {
389405 /* Don't stop anything with start_seq < 1 */
390406
391- for (lpc = max - 1 ; lpc >= 0 ; lpc -- ) {
407+ for (lpc = SIZEOF ( pcmk_children ) - 1 ; lpc >= 0 ; lpc -- ) {
392408 pcmk_child_t * child = & (pcmk_children [lpc ]);
393409
394410 if (phase != child -> start_seq ) {
@@ -436,6 +452,13 @@ pcmk_shutdown_worker(gpointer user_data)
436452 }
437453
438454 crm_notice ("Shutdown complete" );
455+ pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE ;
456+ if (!fatal_error && running_with_sbd &&
457+ pcmk__get_sbd_sync_resource_startup () &&
458+ !shutdown_complete_state_reported_client_closed ) {
459+ crm_notice ("Waiting for SBD to pick up shutdown-complete-state." );
460+ return TRUE;
461+ }
439462
440463 {
441464 const char * delay = pcmk__env_option ("shutdown_delay" );
@@ -489,6 +512,55 @@ pcmk_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
489512 return 0 ;
490513}
491514
515+ static void
516+ pcmk_handle_ping_request (pcmk__client_t * c , xmlNode * msg , uint32_t id )
517+ {
     /* Answer a CRM_OP_PING request received on pacemakerd's IPC.
      *
      * c   - client the reply is sent to; its pid is remembered once the
      *       shutdown-complete state has been handed to an SBD client
      * msg - the request; F_CRM_SYS_FROM, F_CRM_ORIGIN and F_CRM_SYS_TO
      *       are read from it
      * id  - request id, passed through to pcmk__ipc_send_xml()
      *
      * The reply carries the current pacemakerd_state, a timestamp and
      * status "ok". If the sender's F_CRM_SYS_FROM contains "sbd", the ping
      * additionally advances the startup/shutdown handshake (see the end
      * of this function).
      */
518+ const char * value = NULL ;
519+ xmlNode * ping = NULL ;
520+ xmlNode * reply = NULL ;
521+ time_t pinged = time (NULL );
522+ const char * from = crm_element_value (msg , F_CRM_SYS_FROM );
523+
524+ /* Trace who pinged us for status */
525+ crm_trace ("Pinged from %s.%s" ,
526+ crm_str (crm_element_value (msg , F_CRM_ORIGIN )),
527+ from ?from :"unknown" );
     /* Build the ping payload: the request's F_CRM_SYS_TO is reported back
      * as XML_PING_ATTR_SYSFROM, together with state, timestamp and status.
      */
528+ ping = create_xml_node (NULL , XML_CRM_TAG_PING );
529+ value = crm_element_value (msg , F_CRM_SYS_TO );
530+ crm_xml_add (ping , XML_PING_ATTR_SYSFROM , value );
531+ crm_xml_add (ping , XML_PING_ATTR_PACEMAKERDSTATE , pacemakerd_state );
532+ crm_xml_add_ll (ping , XML_ATTR_TSTAMP , (long long ) pinged );
533+ crm_xml_add (ping , XML_PING_ATTR_STATUS , "ok" );
534+ reply = create_reply (msg , ping );
535+ free_xml (ping );
536+ if (reply ) {
537+ if (pcmk__ipc_send_xml (c , id , reply , crm_ipc_server_event ) !=
538+ pcmk_rc_ok ) {
539+ crm_err ("Failed sending ping-reply" );
540+ }
541+ free_xml (reply );
542+ } else {
543+ crm_err ("Failed building ping-reply" );
544+ }
545+ /* Only a ping from SBD (sender name contains "sbd") advances our state */
     /* NOTE(review): the code below also runs when building or sending the
      * reply failed above, so the shutdown-complete state may be recorded as
      * reported although SBD never received it - confirm this is intended.
      */
546+ if (from && strstr (from , "sbd" )) {
547+ if (crm_str_eq (pacemakerd_state ,
548+ XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE ,
549+ TRUE)) {
550+ if (pcmk__get_sbd_sync_resource_startup ()) {
551+ crm_notice ("Shutdown-complete-state passed to SBD." );
552+ }
     /* Remember the client's pid; pcmk_ipc_closed() compares against it
      * to detect that this client has closed its connection.
      */
553+ shutdown_complete_state_reported_to = c -> pid ;
554+ } else if (crm_str_eq (pacemakerd_state ,
555+ XML_PING_ATTR_PACEMAKERDSTATE_WAITPING ,
556+ TRUE)) {
557+ crm_notice ("Received startup-trigger from SBD." );
558+ pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_STARTINGDAEMONS ;
     /* Fire the trigger that main() registered with
      * init_children_processes as its callback.
      */
559+ mainloop_set_trigger (startup_trigger );
560+ }
561+ }
562+ }
563+
492564/* Exit code means? */
493565static int32_t
494566pcmk_ipc_dispatch (qb_ipcs_connection_t * qbc , void * data , size_t size )
@@ -514,6 +586,9 @@ pcmk_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size)
514586 crm_trace ("Ignoring IPC request to purge node "
515587 "because peer cache is not used" );
516588
589+ } else if (crm_str_eq (task , CRM_OP_PING , TRUE)) {
590+ pcmk_handle_ping_request (c , msg , id );
591+
517592 } else {
518593 crm_debug ("Unrecognized IPC command '%s' sent to pacemakerd" ,
519594 crm_str (task ));
@@ -533,6 +608,12 @@ pcmk_ipc_closed(qb_ipcs_connection_t * c)
533608 return 0 ;
534609 }
535610 crm_trace ("Connection %p" , c );
611+ if (shutdown_complete_state_reported_to == client -> pid ) {
612+ shutdown_complete_state_reported_client_closed = TRUE;
613+ if (shutdown_trigger ) {
614+ mainloop_set_trigger (shutdown_trigger );
615+ }
616+ }
536617 pcmk__free_client (client );
537618 return 0 ;
538619}
@@ -924,8 +1005,8 @@ find_and_track_existing_processes(void)
9241005 return pcmk_rc_ok ;
9251006}
9261007
927- static void
928- init_children_processes (void )
1008+ static gboolean
1009+ init_children_processes (void * user_data )
9291010{
9301011 int start_seq = 1 , lpc = 0 ;
9311012 static int max = SIZEOF (pcmk_children );
@@ -951,6 +1032,8 @@ init_children_processes(void)
9511032 * This may be useful for the daemons to know
9521033 */
9531034 setenv ("PCMK_respawned" , "true" , 1 );
1035+ pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_RUNNING ;
1036+ return TRUE;
9541037}
9551038
9561039static void
@@ -1154,6 +1237,7 @@ main(int argc, char **argv)
11541237
11551238 if (pcmk_locate_sbd () > 0 ) {
11561239 setenv ("PCMK_watchdog" , "true" , 1 );
1240+ running_with_sbd = TRUE;
11571241 } else {
11581242 setenv ("PCMK_watchdog" , "false" , 1 );
11591243 }
@@ -1170,7 +1254,19 @@ main(int argc, char **argv)
11701254 mainloop_add_signal (SIGTERM , pcmk_shutdown );
11711255 mainloop_add_signal (SIGINT , pcmk_shutdown );
11721256
1173- init_children_processes ();
1257+ if ((running_with_sbd ) && pcmk__get_sbd_sync_resource_startup ()) {
1258+ crm_notice ("Waiting for startup-trigger from SBD." );
1259+ pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_WAITPING ;
1260+ startup_trigger = mainloop_add_trigger (G_PRIORITY_HIGH , init_children_processes , NULL );
1261+ } else {
1262+ if (running_with_sbd ) {
1263+ crm_warn ("Enabling SBD_SYNC_RESOURCE_STARTUP would (if supported "
1264+ "by your SBD version) improve reliability of "
1265+ "interworking between SBD & pacemaker." );
1266+ }
1267+ pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_STARTINGDAEMONS ;
1268+ init_children_processes (NULL );
1269+ }
11741270
11751271 crm_notice ("Pacemaker daemon successfully started and accepting connections" );
11761272 g_main_loop_run (mainloop );
0 commit comments