Merge pull request #8569 from AboorvaDevarajan/fix_perf_regression

awlauria · web-flow · commit 5585b0348946 · 2021-03-11T17:26:25.000-05:00
ompi/datatype: Fix performance regression in reduce collective
diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h
@@ -110,7 +110,8 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
     *(COUNT) -= cando_count;
 
     if(_elem->blocklen < 9) {
-        if(OPAL_LIKELY(OPAL_SUCCESS == opal_datatype_pack_predefined_element(&_memory, &_packed, cando_count, _elem)))   {
+        if((!(CONVERTOR->flags & CONVERTOR_CUDA)) && OPAL_LIKELY(OPAL_SUCCESS ==
+                    opal_datatype_pack_predefined_element(&_memory, &_packed, cando_count, _elem)))   {
             goto update_and_return;
         }
         /* else unrecognized _elem->common.type, use the memcpy path */
diff --git a/opal/datatype/opal_datatype_pack_unpack_predefined.h b/opal/datatype/opal_datatype_pack_unpack_predefined.h
@@ -256,26 +256,6 @@ opal_datatype_unpack_predefined_element( unsigned char** rtn_src,
     unsigned char *src = *rtn_src;
     unsigned char *dest = *rtn_dest;
 
-#if OPAL_CUDA_SUPPORT
-    if (opal_cuda_check_bufs(dest, src)) {
-        return OPAL_ERROR;
-    }
-/*
- *  For checking if elem contains cuda memory, I think it's mostly okay
- *  to only check the first element as done above.  Although a complete
- *  MPI datatype could easily be made to span both gpu and system memory,
- *  I don't think that's true for the individual vector elements that make
- *  up a datatype's description.  The only way I can even conceive of that
- *  being untrue is if the element has only two entries with a crazy
- *  extent sized to hit both locations.  I don't really think that's
- *  possible, but I'm checking it anyway below.
- */
-    if (elem->count == 2 && cando_count >= blocklen &&
-       (opal_cuda_check_bufs(dest + elem->extent, src)))
-    {
-        return OPAL_ERROR;
-    }
-#endif
   if ((uintptr_t)src % align  ||
       (uintptr_t)dest % align ||
       (elem->extent % align && cando_count > blocklen))
@@ -415,16 +395,6 @@ opal_datatype_pack_predefined_element( unsigned char** rtn_src,
     unsigned char *src = *rtn_src;
     unsigned char *dest = *rtn_dest;
 
-#if OPAL_CUDA_SUPPORT
-    if (opal_cuda_check_bufs(dest, src)) {
-        return OPAL_ERROR;
-    }
-    if (elem->count == 2 && cando_count >= blocklen &&
-       (opal_cuda_check_bufs(dest, src + elem->extent)))
-    {
-        return OPAL_ERROR;
-    }
-#endif
   if ((uintptr_t)src % align  ||
       (uintptr_t)dest % align ||
       (elem->extent % align && cando_count > blocklen))
diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h
@@ -106,7 +106,8 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
     *(COUNT) -= cando_count;
 
     if( _elem->blocklen < 9 ) {
-        if(OPAL_LIKELY(OPAL_SUCCESS == opal_datatype_unpack_predefined_element(&_packed, &_memory, cando_count, _elem))) {
+        if((!(CONVERTOR->flags & CONVERTOR_CUDA)) && OPAL_LIKELY(OPAL_SUCCESS ==
+               opal_datatype_unpack_predefined_element(&_packed, &_memory, cando_count, _elem))) {
             goto update_and_return;
         }
         /* else unrecognized _elem->common.type, use the memcpy path */

Original file line number	Diff line number	Diff line change
`@@ -110,7 +110,8 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,`
`110`	`110`	`*(COUNT) -= cando_count;`
`111`	`111`
`112`	`112`	`if(_elem->blocklen < 9) {`
`113`		`- if(OPAL_LIKELY(OPAL_SUCCESS == opal_datatype_pack_predefined_element(&_memory, &_packed, cando_count, _elem))) {`
	`113`	`+ if((!(CONVERTOR->flags & CONVERTOR_CUDA)) && OPAL_LIKELY(OPAL_SUCCESS ==`
	`114`	`+ opal_datatype_pack_predefined_element(&_memory, &_packed, cando_count, _elem))) {`
`114`	`115`	`goto update_and_return;`
`115`	`116`	`}`
`116`	`117`	`/* else unrecognized _elem->common.type, use the memcpy path */`
Original file line number	Diff line number	Diff line change
`@@ -106,7 +106,8 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,`
`106`	`106`	`*(COUNT) -= cando_count;`
`107`	`107`
`108`	`108`	`if( _elem->blocklen < 9 ) {`
`109`		`- if(OPAL_LIKELY(OPAL_SUCCESS == opal_datatype_unpack_predefined_element(&_packed, &_memory, cando_count, _elem))) {`
	`109`	`+ if((!(CONVERTOR->flags & CONVERTOR_CUDA)) && OPAL_LIKELY(OPAL_SUCCESS ==`
	`110`	`+ opal_datatype_unpack_predefined_element(&_packed, &_memory, cando_count, _elem))) {`
`110`	`111`	`goto update_and_return;`
`111`	`112`	`}`
`112`	`113`	`/* else unrecognized _elem->common.type, use the memcpy path */`