4242
4343#include "orte/mca/errmgr/errmgr.h"
4444#include "orte/mca/rmaps/base/base.h"
45+ #include "orte/mca/rml/base/rml_contact.h"
4546#include "orte/mca/state/state.h"
4647#include "orte/util/name_fns.h"
4748#include "orte/util/show_help.h"
@@ -539,7 +540,14 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
539540 int rc , cnt ;
540541 opal_pmix_pdata_t * pdat ;
541542 orte_job_t * jdata ;
542- opal_buffer_t buf ;
543+ orte_node_t * node ;
544+ orte_proc_t * proc ;
545+ opal_buffer_t buf , bucket ;
546+ opal_byte_object_t * bo ;
547+ orte_process_name_t dmn , pname ;
548+ char * uri ;
549+ opal_value_t val ;
550+ opal_list_t nodes ;
543551
544552 ORTE_ACQUIRE_OBJECT (cd );
545553
@@ -556,6 +564,7 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
556564 pdat = (opal_pmix_pdata_t * )opal_list_get_first (data );
557565 if (OPAL_BYTE_OBJECT != pdat -> value .type ) {
558566 rc = ORTE_ERR_BAD_PARAM ;
567+ ORTE_ERROR_LOG (rc );
559568 goto release ;
560569 }
561570 /* the data will consist of a packed buffer with the job data in it */
@@ -565,15 +574,107 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
565574 pdat -> value .data .bo .size = 0 ;
566575 cnt = 1 ;
567576 if (OPAL_SUCCESS != (rc = opal_dss .unpack (& buf , & jdata , & cnt , ORTE_JOB ))) {
577+ ORTE_ERROR_LOG (rc );
578+ OBJ_DESTRUCT (& buf );
579+ goto release ;
580+ }
581+
582+ /* unpack the byte object containing the daemon URIs */
583+ cnt = 1 ;
584+ if (ORTE_SUCCESS != (rc = opal_dss .unpack (& buf , & bo , & cnt , OPAL_BYTE_OBJECT ))) {
585+ ORTE_ERROR_LOG (rc );
568586 OBJ_DESTRUCT (& buf );
569587 goto release ;
570588 }
589+ /* load it into a buffer */
590+ OBJ_CONSTRUCT (& bucket , opal_buffer_t );
591+ opal_dss .load (& bucket , bo -> bytes , bo -> size );
592+ bo -> bytes = NULL ;
593+ free (bo );
594+ /* prep a list to save the nodes */
595+ OBJ_CONSTRUCT (& nodes , opal_list_t );
596+ /* unpack and store the URIs */
597+ cnt = 1 ;
598+ while (OPAL_SUCCESS == (rc = opal_dss .unpack (& bucket , & uri , & cnt , OPAL_STRING ))) {
599+ rc = orte_rml_base_parse_uris (uri , & dmn , NULL );
600+ if (ORTE_SUCCESS != rc ) {
601+ OBJ_DESTRUCT (& buf );
602+ OBJ_DESTRUCT (& bucket );
603+ goto release ;
604+ }
605+ /* save a node object for this daemon */
606+ node = OBJ_NEW (orte_node_t );
607+ node -> daemon = OBJ_NEW (orte_proc_t );
608+ memcpy (& node -> daemon -> name , & dmn , sizeof (orte_process_name_t ));
609+ opal_list_append (& nodes , & node -> super );
610+ /* register the URI */
611+ OBJ_CONSTRUCT (& val , opal_value_t );
612+ val .key = OPAL_PMIX_PROC_URI ;
613+ val .type = OPAL_STRING ;
614+ val .data .string = uri ;
615+ if (OPAL_SUCCESS != (rc = opal_pmix .store_local (& dmn , & val ))) {
616+ ORTE_ERROR_LOG (rc );
617+ val .key = NULL ;
618+ val .data .string = NULL ;
619+ OBJ_DESTRUCT (& val );
620+ OBJ_DESTRUCT (& buf );
621+ OBJ_DESTRUCT (& bucket );
622+ goto release ;
623+ }
624+ val .key = NULL ;
625+ val .data .string = NULL ;
626+ OBJ_DESTRUCT (& val );
627+ cnt = 1 ;
628+ }
629+ OBJ_DESTRUCT (& bucket );
630+
631+ /* unpack the proc-to-daemon map */
632+ cnt = 1 ;
633+ if (ORTE_SUCCESS != (rc = opal_dss .unpack (& buf , & bo , & cnt , OPAL_BYTE_OBJECT ))) {
634+ ORTE_ERROR_LOG (rc );
635+ OBJ_DESTRUCT (& buf );
636+ goto release ;
637+ }
638+ /* load it into a buffer */
639+ OBJ_CONSTRUCT (& bucket , opal_buffer_t );
640+ opal_dss .load (& bucket , bo -> bytes , bo -> size );
641+ bo -> bytes = NULL ;
642+ free (bo );
643+ /* unpack and store the map */
644+ cnt = 1 ;
645+ while (OPAL_SUCCESS == (rc = opal_dss .unpack (& bucket , & pname , & cnt , ORTE_NAME ))) {
646+ /* get the name of the daemon hosting it */
647+ if (OPAL_SUCCESS != (rc = opal_dss .unpack (& bucket , & dmn , & cnt , ORTE_NAME ))) {
648+ OBJ_DESTRUCT (& buf );
649+ OBJ_DESTRUCT (& bucket );
650+ goto release ;
651+ }
652+ /* create the proc object */
653+ proc = OBJ_NEW (orte_proc_t );
654+ memcpy (& proc -> name , & pname , sizeof (orte_process_name_t ));
655+ opal_pointer_array_set_item (jdata -> procs , pname .vpid , proc );
656+ /* find the daemon */
657+ OPAL_LIST_FOREACH (node , & nodes , orte_node_t ) {
658+ if (node -> daemon -> name .vpid == dmn .vpid ) {
659+ OBJ_RETAIN (node );
660+ proc -> node = node ;
661+ break ;
662+ }
663+ }
664+ }
665+ OBJ_DESTRUCT (& bucket );
666+ OPAL_LIST_DESTRUCT (& nodes );
571667 OBJ_DESTRUCT (& buf );
668+
669+ /* register the nspace */
572670 if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace (jdata , true))) {
671+ ORTE_ERROR_LOG (rc );
573672 OBJ_RELEASE (jdata );
574673 goto release ;
575674 }
576- OBJ_RELEASE (jdata ); // no reason to keep this around
675+
676+ /* save the job object so we don't endlessly cycle */
677+ opal_hash_table_set_value_uint32 (orte_job_data , jdata -> jobid , jdata );
577678
578679 /* restart the cnct processor */
579680 ORTE_PMIX_OPERATION (cd -> procs , cd -> info , _cnct , cd -> cbfunc , cd -> cbdata );
@@ -619,6 +720,7 @@ static void _cnct(int sd, short args, void *cbdata)
619720 * out about it, and all we can do is return an error */
620721 if (orte_pmix_server_globals .server .jobid == ORTE_PROC_MY_HNP -> jobid &&
621722 orte_pmix_server_globals .server .vpid == ORTE_PROC_MY_HNP -> vpid ) {
723+ ORTE_ERROR_LOG (ORTE_ERR_NOT_SUPPORTED );
622724 rc = ORTE_ERR_NOT_SUPPORTED ;
623725 goto release ;
624726 }
@@ -634,6 +736,7 @@ static void _cnct(int sd, short args, void *cbdata)
634736 kv -> data .uint32 = geteuid ();
635737 opal_list_append (cd -> info , & kv -> super );
636738 if (ORTE_SUCCESS != (rc = pmix_server_lookup_fn (& nm -> name , keys , cd -> info , _cnlk , cd ))) {
739+ ORTE_ERROR_LOG (rc );
637740 opal_argv_free (keys );
638741 goto release ;
639742 }
@@ -647,6 +750,7 @@ static void _cnct(int sd, short args, void *cbdata)
647750 if (!orte_get_attribute (& jdata -> attributes , ORTE_JOB_NSPACE_REGISTERED , NULL , OPAL_BOOL )) {
648751 /* it hasn't been registered yet, so register it now */
649752 if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace (jdata , true))) {
753+ ORTE_ERROR_LOG (rc );
650754 goto release ;
651755 }
652756 }
0 commit comments