1515#include "osc_rdma_dynamic.h"
1616
1717#include "ompi/mca/osc/base/osc_base_obj_convert.h"
18+ #include "opal/align.h"
19+
/* forward declaration: ompi_osc_rdma_get_partial () calls this function but is defined ahead of it */
20+ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t * sync , ompi_osc_rdma_peer_t * peer , uint64_t source_address ,
21+ mca_btl_base_registration_handle_t * source_handle , void * target_buffer , size_t size ,
22+ ompi_osc_rdma_request_t * request );
1823
1924static void ompi_osc_get_data_complete (struct mca_btl_base_module_t * btl , struct mca_btl_base_endpoint_t * endpoint ,
2025 void * local_address , mca_btl_base_registration_handle_t * local_handle ,
@@ -136,7 +141,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc
136141 ompi_osc_rdma_peer_t * peer , uint64_t remote_address ,
137142 mca_btl_base_registration_handle_t * remote_handle , int remote_count ,
138143 ompi_datatype_t * remote_datatype , ompi_osc_rdma_request_t * request , const size_t max_rdma_len ,
139- const ompi_osc_rdma_fn_t rdma_fn ,const bool alloc_reqs )
144+ const ompi_osc_rdma_fn_t rdma_fn , const bool alloc_reqs )
140145{
141146 ompi_osc_rdma_module_t * module = sync -> module ;
142147 struct iovec local_iovec [OMPI_OSC_RDMA_DECODE_MAX ], remote_iovec [OMPI_OSC_RDMA_DECODE_MAX ];
@@ -575,11 +580,13 @@ static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struc
575580
576581 assert (OPAL_SUCCESS == status );
577582
578- if (NULL != frag ) {
583+ if (request -> buffer || NULL != frag ) {
579584 if (OPAL_LIKELY (OMPI_SUCCESS == status )) {
580585 memcpy (origin_addr , (void * ) source , request -> len );
581586 }
587+ }
582588
589+ if (NULL != frag ) {
583590 ompi_osc_rdma_frag_complete (frag );
584591 } else {
585592 ompi_osc_rdma_deregister (sync -> module , local_handle );
@@ -621,6 +628,27 @@ int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer)
621628
622629}
623630
631+ static int ompi_osc_rdma_get_partial (ompi_osc_rdma_sync_t * sync , ompi_osc_rdma_peer_t * peer , uint64_t source_address ,
632+ mca_btl_base_registration_handle_t * source_handle , void * target_buffer , size_t size ,
633+ ompi_osc_rdma_request_t * request ) {
634+ ompi_osc_rdma_module_t * module = sync -> module ;
635+ ompi_osc_rdma_request_t * subreq ;
636+ int ret ;
637+
638+ OMPI_OSC_RDMA_REQUEST_ALLOC (module , peer , subreq );
639+ subreq -> internal = true;
640+ subreq -> type = OMPI_OSC_RDMA_TYPE_RDMA ;
641+ subreq -> parent_request = request ;
642+ (void ) OPAL_THREAD_ADD32 (& request -> outstanding_requests , 1 );
643+
644+ ret = ompi_osc_rdma_get_contig (sync , peer , source_address , source_handle , target_buffer , size , subreq );
645+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
646+ OMPI_OSC_RDMA_REQUEST_RETURN (subreq );
647+ (void ) OPAL_THREAD_ADD32 (& request -> outstanding_requests , -1 );
648+ }
649+
650+ return ret ;
651+ }
624652
625653static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t * sync , ompi_osc_rdma_peer_t * peer , uint64_t source_address ,
626654 mca_btl_base_registration_handle_t * source_handle , void * target_buffer , size_t size ,
@@ -639,33 +667,81 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
639667 aligned_source_bound = (source_address + size + btl_alignment_mask ) & ~btl_alignment_mask ;
640668 aligned_len = aligned_source_bound - aligned_source_base ;
641669
642- request -> offset = source_address - aligned_source_base ;
643- request -> len = size ;
644- request -> origin_addr = target_buffer ;
645- request -> sync = sync ;
646-
647670 OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p" ,
648671 size , source_address , target_buffer );
649672
650673 if ((module -> selected_btl -> btl_register_mem && size > module -> selected_btl -> btl_get_local_registration_threshold ) ||
651674 (((uint64_t ) target_buffer | size | source_address ) & btl_alignment_mask )) {
675+
652676 ret = ompi_osc_rdma_frag_alloc (module , aligned_len , & frag , & ptr );
653677 if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
654- /* check for alignment */
655- if (!(((uint64_t ) target_buffer | size | source_address ) & btl_alignment_mask )) {
656- (void ) ompi_osc_rdma_register (module , peer -> data_endpoint , target_buffer , size , MCA_BTL_REG_FLAG_LOCAL_WRITE ,
678+ if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret ) {
679+ /* region is too large for a buffered read */
680+ size_t subsize ;
681+
682+ if ((source_address & btl_alignment_mask ) && (source_address & btl_alignment_mask ) == ((intptr_t ) target_buffer & btl_alignment_mask )) {
683+ /* remote region has the same alignment but the base is not aligned. perform a small
684+ * buffered get of the beginning of the remote region */
685+ aligned_source_base = OPAL_ALIGN (source_address , module -> selected_btl -> btl_get_alignment , osc_rdma_base_t );
686+ subsize = (size_t ) (aligned_source_base - source_address );
687+
688+ ret = ompi_osc_rdma_get_partial (sync , peer , source_address , source_handle , target_buffer , subsize , request );
689+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
690+ return ret ;
691+ }
692+
693+ source_address += subsize ;
694+ target_buffer = (void * ) ((intptr_t ) target_buffer + subsize );
695+ size -= subsize ;
696+
697+ aligned_len = aligned_source_bound - aligned_source_base ;
698+ }
699+
700+ if (!(((uint64_t ) target_buffer | source_address ) & btl_alignment_mask ) &&
701+ (size & btl_alignment_mask )) {
702+ /* remote region bases are aligned but the bounds are not. perform a
703+ * small buffered get of the end of the remote region */
704+ aligned_len = size & ~btl_alignment_mask ;
705+ subsize = size - aligned_len ;
706+ size = aligned_len ;
707+ ret = ompi_osc_rdma_get_partial (sync , peer , source_address + aligned_len , source_handle ,
708+ (void * ) ((intptr_t ) target_buffer + aligned_len ), subsize , request );
709+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
710+ return ret ;
711+ }
712+ }
713+ /* (remaining) user request is now correctly aligned */
714+ }
715+
716+ if ((((uint64_t ) target_buffer | size | source_address ) & btl_alignment_mask )) {
717+ /* local and remote alignments differ */
718+ request -> buffer = ptr = malloc (aligned_len );
719+ } else {
720+ ptr = target_buffer ;
721+ }
722+
723+ if (NULL != ptr ) {
724+ (void ) ompi_osc_rdma_register (module , peer -> data_endpoint , ptr , aligned_len , MCA_BTL_REG_FLAG_LOCAL_WRITE ,
657725 & local_handle );
658726 }
659727
660728 if (OPAL_UNLIKELY (NULL == local_handle )) {
661- return OMPI_ERR_OUT_OF_RESOURCE ;
729+ free (request -> buffer );
730+ request -> buffer = NULL ;
731+ return ret ;
662732 }
663733 } else {
664- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "using internal buffer %p in fragment %p for get" , ptr , (void * ) frag );
734+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx" ,
735+ ptr , (void * ) frag , aligned_len , (unsigned long ) aligned_source_base );
665736 local_handle = frag -> handle ;
666737 }
667738 }
668739
740+ request -> offset = source_address - aligned_source_base ;
741+ request -> len = size ;
742+ request -> origin_addr = target_buffer ;
743+ request -> sync = sync ;
744+
669745 ompi_osc_rdma_sync_rdma_inc (sync );
670746
671747 do {
0 commit comments