@@ -376,17 +376,13 @@ static void send_msg(int fd, short args, void *cbdata)
376376 uint32_t total_packets ;
377377 fi_addr_t dest_fi_addr ;
378378 orte_rml_send_t * snd ;
379- orte_rml_recv_t * rcv ;
380- orte_self_send_xfer_t * xfer ;
381379 orte_rml_ofi_request_t * ofi_send_req = OBJ_NEW ( orte_rml_ofi_request_t );
382380 uint8_t ofi_prov_id = req -> ofi_prov_id ;
383381 orte_rml_ofi_send_pkt_t * ofi_msg_pkt ;
384382 size_t datalen_per_pkt , hdrsize , data_in_pkt ; // the length of data in per packet excluding the header size
385383 orte_rml_ofi_peer_t * pr ;
386384 uint64_t ui64 ;
387385 struct sockaddr_in * ep_sockaddr ;
388- int i , bytes ;
389- char * ptr ;
390386
391387 snd = OBJ_NEW (orte_rml_send_t );
392388 snd -> dst = * peer ;
@@ -408,85 +404,59 @@ static void send_msg(int fd, short args, void *cbdata)
408404 ORTE_NAME_PRINT (peer ), tag );
409405
410406
411- /* get the peer address by doing modex_receive */
407+ /* get the peer address from our internal hash table */
408+ opal_output_verbose (1 , orte_rml_base_framework .framework_output ,
409+ "%s getting contact info for DAEMON peer %s from internal hash table" ,
410+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), ORTE_NAME_PRINT (peer ));
411+ memcpy (& ui64 , (char * )peer , sizeof (uint64_t ));
412+ if (OPAL_SUCCESS != (ret = opal_hash_table_get_value_uint64 (& orte_rml_ofi .peers ,
413+ ui64 , (void * * )& pr ) || NULL == pr )) {
414+ opal_output_verbose (1 , orte_rml_base_framework .framework_output ,
415+ "%s rml:ofi: Send failed to get peer OFI contact info " ,
416+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ));
417+ snd -> status = ORTE_ERR_ADDRESSEE_UNKNOWN ;
418+ ORTE_RML_SEND_COMPLETE (snd );
419+ //OBJ_RELEASE( ofi_send_req);
420+ return ;
421+ }
422+ opal_output_verbose (1 , orte_rml_base_framework .framework_output ,
423+ "%s rml:ofi: OFI peer contact info got from hash table" ,
424+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ));
425+ dest_ep_name = pr -> ofi_ep ;
426+ dest_ep_namelen = pr -> ofi_ep_len ;
427+
428+ //[Debug] printing additional info of IP
429+ switch ( orte_rml_ofi .ofi_prov [ofi_prov_id ].fabric_info -> addr_format )
430+ {
431+ case FI_SOCKADDR_IN :
432+ /* Address is of type sockaddr_in (IPv4) */
433+ /*[debug] - print the sockaddr - port and s_addr */
434+ ep_sockaddr = (struct sockaddr_in * )dest_ep_name ;
435+ opal_output_verbose (1 ,orte_rml_base_framework .framework_output ,
436+ "%s peer %s epnamelen is %lu, port = %d (or) 0x%x, InternetAddr = 0x%s " ,
437+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),ORTE_NAME_PRINT (peer ),
438+ (unsigned long )orte_rml_ofi .ofi_prov [ofi_prov_id ].epnamelen ,ntohs (ep_sockaddr -> sin_port ),
439+ ntohs (ep_sockaddr -> sin_port ),inet_ntoa (ep_sockaddr -> sin_addr ));
440+ /*[end debug]*/
441+ break ;
442+ }
443+ //[Debug] end debug
412444 opal_output_verbose (10 , orte_rml_base_framework .framework_output ,
413- "%s calling OPAL_MODEX_RECV_STRING " , ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ) );
414- if (ORTE_PROC_IS_APP ) {
415- asprintf (& pmix_key ,"%s%d" ,orte_rml_ofi .ofi_prov [ofi_prov_id ].fabric_info -> fabric_attr -> prov_name ,ofi_prov_id );
416- opal_output_verbose (1 , orte_rml_base_framework .framework_output ,
417- "%s calling OPAL_MODEX_RECV_STRING for ORTE_PROC_APP peer - %s, key - %s " ,
418- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), ORTE_NAME_PRINT (peer ),pmix_key );
419- OPAL_MODEX_RECV_STRING (ret , pmix_key , peer , (uint8_t * * ) & dest_ep_name , & dest_ep_namelen );
420- opal_output_verbose (10 , orte_rml_base_framework .framework_output , "Returned from MODEX_RECV" );
421- opal_output_verbose (50 , orte_rml_base_framework .framework_output ,
422- "%s Return value from OPAL_MODEX_RECV_STRING - %d, length returned - %lu" ,
423- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), ret , dest_ep_namelen );
424- free (pmix_key );
425- } else {
445+ "%s OPAL_MODEX_RECV succeded, %s peer ep name obtained. length=%lu" ,
446+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
447+ ORTE_NAME_PRINT (peer ), dest_ep_namelen );
448+ ret = fi_av_insert (orte_rml_ofi .ofi_prov [ofi_prov_id ].av , dest_ep_name ,1 ,& dest_fi_addr ,0 ,NULL );
449+ if ( ret != 1 ) {
426450 opal_output_verbose (1 , orte_rml_base_framework .framework_output ,
427- "%s calling OPAL_MODEX_RECV_STRING for DAEMON peer %s" ,
428- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), ORTE_NAME_PRINT (peer ));
429- memcpy (& ui64 , (char * )peer , sizeof (uint64_t ));
430- if (OPAL_SUCCESS != opal_hash_table_get_value_uint64 (& orte_rml_ofi .peers ,
431- ui64 , (void * * )& pr ) || NULL == pr ) {
432- opal_output_verbose (1 , orte_rml_base_framework .framework_output ,
433- "%s rml:ofi: Send failed to get peer OFI contact info " ,
434- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ));
435- return ;
436- }
437- opal_output_verbose (1 , orte_rml_base_framework .framework_output ,
438- "%s rml:ofi: OFI peer contact info got from hash table" ,
439- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ));
440- dest_ep_name = pr -> ofi_ep ;
441- dest_ep_namelen = pr -> ofi_ep_len ;
442- ret = OPAL_SUCCESS ;
443- }
444- if ( OPAL_SUCCESS == ret ) {
445- //[Debug] printing additional info of IP
446- switch ( orte_rml_ofi .ofi_prov [ofi_prov_id ].fabric_info -> addr_format )
447- {
448- case FI_SOCKADDR_IN :
449- /* Address is of type sockaddr_in (IPv4) */
450- /*[debug] - print the sockaddr - port and s_addr */
451- ep_sockaddr = (struct sockaddr_in * )dest_ep_name ;
452- opal_output_verbose (1 ,orte_rml_base_framework .framework_output ,
453- "%s peer %s epnamelen is %d, port = %d (or) 0x%x, InternetAddr = 0x%s " ,
454- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),ORTE_NAME_PRINT (peer ),
455- orte_rml_ofi .ofi_prov [ofi_prov_id ].epnamelen ,ntohs (ep_sockaddr -> sin_port ),
456- ntohs (ep_sockaddr -> sin_port ),inet_ntoa (ep_sockaddr -> sin_addr ));
457- /*[end debug]*/
458- break ;
459- }
460- //[Debug] end debug
461- opal_output_verbose (10 , orte_rml_base_framework .framework_output ,
462- "%s OPAL_MODEX_RECV succeded, %s peer ep name obtained. length=%lu" ,
463- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
464- ORTE_NAME_PRINT (peer ), dest_ep_namelen );
465- ret = fi_av_insert (orte_rml_ofi .ofi_prov [ofi_prov_id ].av , dest_ep_name ,1 ,& dest_fi_addr ,0 ,NULL );
466- if ( ret != 1 ) {
467- opal_output_verbose (1 , orte_rml_base_framework .framework_output ,
468- "%s fi_av_insert failed in send_msg() returned %d" ,
469- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),ret );
470- /* call the send-callback fn with error and return, also return failure status */
471- snd -> status = ORTE_ERR_ADDRESSEE_UNKNOWN ;
451+ "%s fi_av_insert failed in send_msg() returned %d" ,
452+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),ret );
453+ /* call the send-callback fn with error and return, also return failure status */
454+ snd -> status = ORTE_ERR_ADDRESSEE_UNKNOWN ;
472455
473- ORTE_RML_SEND_COMPLETE (snd );
474-
475- return ;
476- }
477- } else {
478-
479- opal_output_verbose (1 , orte_rml_base_framework .framework_output ,
480- "%s OPAL_MODEX_RECV failed to obtain %s peer ep name " ,
481- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
482- ORTE_NAME_PRINT (peer ));
483- /* call the send-callback fn with error and return, also return failure status */
484- snd -> status = ORTE_ERR_ADDRESSEE_UNKNOWN ;
485456 ORTE_RML_SEND_COMPLETE (snd );
486- //OBJ_RELEASE( ofi_send_req);
457+
487458 return ;
488459 }
489-
490460 ofi_send_req -> send = snd ;
491461 ofi_send_req -> completion_count = 1 ;
492462
@@ -625,7 +595,6 @@ int orte_rml_ofi_send_nb(struct orte_rml_base_module_t* mod,
625595 void * cbdata )
626596{
627597 orte_rml_recv_t * rcv ;
628- orte_rml_send_t * snd ;
629598 int bytes ;
630599 orte_self_send_xfer_t * xfer ;
631600 int i ;
@@ -749,7 +718,6 @@ int orte_rml_ofi_send_buffer_nb(struct orte_rml_base_module_t *mod,
749718 void * cbdata )
750719{
751720 orte_rml_recv_t * rcv ;
752- orte_rml_send_t * snd ;
753721 orte_self_send_xfer_t * xfer ;
754722 ofi_send_request_t * req ;
755723 orte_rml_ofi_module_t * ofi_mod = (orte_rml_ofi_module_t * )mod ;
0 commit comments