@@ -1749,6 +1749,9 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
17491749    void  * attrdata [] =  {(void  * )& memType , (void  * )& ctx , (void  * )& isManaged };
17501750
17511751    res  =  cuFunc .cuPointerGetAttributes (3 , attributes , attrdata , dbuf );
1752+     OPAL_OUTPUT_VERBOSE ((101 , mca_common_cuda_output , 
1753+ 			"dbuf=%p, memType=%d, ctx=%p, isManaged=%d, res=%d" ,
1754+ 			 (void  * )dbuf , (int )memType , (void  * )ctx , isManaged , res ));
17521755
17531756    /* Mark unified memory buffers with a flag.  This will allow all unified 
17541757     * memory to be forced through host buffers.  Note that this memory can 
@@ -1822,13 +1825,34 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
18221825        }
18231826    }
18241827
1828+     /* WORKAROUND - There are times when the above code determines a piece of memory 
1829+      * is GPU memory, but it actually is not.  That has been seen on multi-GPU systems 
1830+      * with 6 or 8 GPUs on them. Therefore, we will do this extra check.  Note that if we 
1831+      * made it this far, then the assumption at this point is we have GPU memory. 
1832+      * Unfortunately, this extra call is costing us another 100 ns, almost doubling 
1833+      * the cost of this entire function. */ 
1834+     {
1835+         CUdeviceptr  pbase ;
1836+         size_t  psize ;
1837+         res  =  cuFunc .cuMemGetAddressRange (& pbase , & psize , dbuf );
1838+         if  (CUDA_SUCCESS  !=  res ) {
1839+             opal_output_verbose (5 , mca_common_cuda_output , 
1840+                                 "CUDA: cuMemGetAddressRange failed on this pointer: res=%d, buf=%p " 
1841+                                 "Overriding check and setting to host pointer. " ,
1842+                               res , (void  * )dbuf );
1843+             /* This cannot be GPU memory if the previous call failed */ 
1844+             return  0 ;
1845+         }
1846+     }
1847+ 
18251848    /* First access on a device pointer finalizes CUDA support initialization. 
18261849     * If initialization fails, disable support. */ 
18271850    if  (!stage_three_init_complete ) {
18281851        if  (0  !=  mca_common_cuda_stage_three_init ()) {
18291852            opal_cuda_support  =  0 ;
18301853        }
18311854    }
1855+ 
18321856    return  1 ;
18331857}
18341858
0 commit comments