@@ -144,6 +144,7 @@ static void *mca_rcache_udreg_reg_func (void *addr, uint64_t size, void *reg_con
144144 udreg_reg -> rcache = reg_context ;
145145 udreg_reg -> base = OPAL_DOWN_ALIGN_PTR (addr , page_size , unsigned char * );
146146 udreg_reg -> bound = OPAL_ALIGN_PTR ((intptr_t ) addr + size , page_size , unsigned char * ) - 1 ;
147+ udreg_reg -> ref_count = 0 ;
147148
148149 addr = (void * ) udreg_reg -> base ;
149150 size = (uint64_t ) (udreg_reg -> bound - udreg_reg -> base + 1 );
@@ -161,7 +162,8 @@ static void *mca_rcache_udreg_reg_func (void *addr, uint64_t size, void *reg_con
161162 opal_output_verbose (MCA_BASE_VERBOSE_WARN , opal_rcache_base_framework .framework_output ,
162163 "rcache/udreg: could not register memory. rc: %d" , rc );
163164 opal_free_list_return (& rcache_udreg -> reg_list , item );
164- udreg_reg = NULL ;
165+ /* NTH: this is the only way to get UDReg_Register to recognize a failure */
166+ udreg_reg = UDREG_DEVICE_REG_FAILED ;
165167 }
166168
167169 return udreg_reg ;
@@ -173,13 +175,9 @@ static uint32_t mca_rcache_udreg_dereg_func (void *device_data, void *dreg_conte
173175 mca_rcache_base_registration_t * udreg_reg = (mca_rcache_base_registration_t * ) device_data ;
174176 int rc ;
175177
176- if (udreg_reg -> ref_count ) {
177- /* there are still users of this registration. leave it alone */
178- return 0 ;
179- }
178+ assert (udreg_reg -> ref_count == 0 );
180179
181180 rc = rcache_udreg -> resources .base .deregister_mem (rcache_udreg -> resources .base .reg_data , udreg_reg );
182-
183181 if (OPAL_LIKELY (OPAL_SUCCESS == rc )) {
184182 opal_free_list_return (& rcache_udreg -> reg_list ,
185183 (opal_free_list_item_t * ) udreg_reg );
@@ -208,8 +206,9 @@ static int mca_rcache_udreg_register(mca_rcache_base_module_t *rcache, void *add
208206 mca_rcache_udreg_module_t * rcache_udreg = (mca_rcache_udreg_module_t * ) rcache ;
209207 mca_rcache_base_registration_t * udreg_reg , * old_reg ;
210208 bool bypass_cache = !!(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS );
211- udreg_entry_t * udreg_entry ;
212- udreg_return_t urc ;
209+ const unsigned int page_size = opal_getpagesize ();
210+ unsigned char * base , * bound ;
211+ udreg_entry_t * udreg_entry = NULL ;
213212
214213 * reg = NULL ;
215214
@@ -219,60 +218,76 @@ static int mca_rcache_udreg_register(mca_rcache_base_module_t *rcache, void *add
219218 rcache_udreg -> requested_access_flags = access_flags ;
220219 rcache_udreg -> requested_flags = flags ;
221220
221+ base = OPAL_DOWN_ALIGN_PTR (addr , page_size , unsigned char * );
222+ bound = OPAL_ALIGN_PTR ((intptr_t ) addr + size , page_size , unsigned char * ) - 1 ;
223+
224+ addr = base ;
225+ size = (size_t ) (uintptr_t ) (bound - base ) + 1 ;
226+
222227 if (false == bypass_cache ) {
223228 /* Get a udreg entry for this region */
224229 do {
225230 opal_output_verbose (MCA_BASE_VERBOSE_INFO , opal_rcache_base_framework .framework_output ,
226- "rcache/udreg: registering region {%p, %p} with udreg" , addr , (void * )((intptr_t ) addr + size ));
227- while (UDREG_RC_SUCCESS !=
228- (urc = UDREG_Register (rcache_udreg -> udreg_handle , addr , size , & udreg_entry ))) {
231+ "rcache/udreg: XXX registering region {%p, %p} with udreg" , addr , (void * )((intptr_t ) addr + size ));
232+ while (UDREG_RC_SUCCESS != UDREG_Register (rcache_udreg -> udreg_handle , addr , size , & udreg_entry )) {
229233 /* try to remove one unused reg and retry */
234+ opal_output_verbose (MCA_BASE_VERBOSE_INFO , opal_rcache_base_framework .framework_output ,
235+ "calling evict!" );
230236 if (!mca_rcache_udreg_evict (rcache )) {
231237 opal_output_verbose (MCA_BASE_VERBOSE_INFO , opal_rcache_base_framework .framework_output ,
232- "rcache/udreg: could not register memory with udreg. udreg rc: %d" , urc );
238+ "rcache/udreg: could not register memory with udreg" );
233239 OPAL_THREAD_UNLOCK (& rcache_udreg -> lock );
234240 return OPAL_ERR_OUT_OF_RESOURCE ;
235241 }
236242 }
237243
238244 udreg_reg = (mca_rcache_base_registration_t * ) udreg_entry -> device_data ;
239-
240- if ((udreg_reg -> access_flags & access_flags ) == access_flags ) {
245+ if (NULL != udreg_reg && (udreg_reg -> access_flags & access_flags ) == access_flags ) {
241246 /* sufficient access */
242247 break ;
243248 }
244249
245250 old_reg = udreg_reg ;
246251
247- /* to not confuse udreg make sure the new registration covers the same address
248- * range as the old one. */
249- addr = old_reg -> base ;
250- size = (size_t )((intptr_t ) old_reg -> bound - (intptr_t ) old_reg -> base );
252+ if (old_reg ) {
253+ /* to not confuse udreg make sure the new registration covers the same address
254+ * range as the old one. */
255+ addr = old_reg -> base ;
256+ size = (size_t )((intptr_t ) old_reg -> bound - (intptr_t ) old_reg -> base );
257+
258+ /* make the new access flags more permissive */
259+ access_flags |= old_reg -> access_flags ;
260+
261+ if (!old_reg -> ref_count ) {
262+ /* deregister the region before attempting to re-register */
263+ mca_rcache_udreg_dereg_func (old_reg , rcache );
264+ udreg_entry -> device_data = NULL ;
265+ old_reg = NULL ;
266+ } else {
267+ /* ensure that mca_rcache_udreg_deregister does not call into udreg since
268+ * we are forcefully evicting the registration here */
269+ old_reg -> flags |= MCA_RCACHE_FLAGS_CACHE_BYPASS | MCA_RCACHE_FLAGS_INVALID ;
270+ }
271+ }
251272
252- /* make the new access flags more permissive */
253- rcache_udreg -> requested_access_flags = access_flags | old_reg -> access_flags ;
273+ rcache_udreg -> requested_access_flags = access_flags ;
254274
255275 /* get a new registration */
256- udreg_reg = mca_rcache_udreg_reg_func (addr , size , rcache );
257- if (NULL == udreg_reg ) {
258- OPAL_THREAD_UNLOCK (& rcache_udreg -> lock );
259- return OPAL_ERR_OUT_OF_RESOURCE ;
276+ while (UDREG_DEVICE_REG_FAILED == (udreg_reg = mca_rcache_udreg_reg_func (addr , size , rcache ))) {
277+ if (!mca_rcache_udreg_evict (rcache )) {
278+ opal_output_verbose (MCA_BASE_VERBOSE_INFO , opal_rcache_base_framework .framework_output ,
279+ "rcache/udreg: could not register memory with udreg" );
280+ OPAL_THREAD_UNLOCK (& rcache_udreg -> lock );
281+ return OPAL_ERR_OUT_OF_RESOURCE ;
282+ }
260283 }
261284
262285 /* update the device data with the new registration */
263286 udreg_entry -> device_data = udreg_reg ;
264-
265- /* ensure that mca_rcache_udreg_deregister does not call into udreg since
266- * we are forcefully evicting the registration here */
267- old_reg -> flags |= MCA_RCACHE_FLAGS_CACHE_BYPASS | MCA_RCACHE_FLAGS_INVALID ;
268-
269- mca_rcache_udreg_dereg_func (old_reg , rcache );
270287 } while (0 );
271-
272- udreg_reg -> rcache_context = udreg_entry ;
273288 } else {
274289 /* if cache bypass is requested don't use the udreg cache */
275- while (NULL == (udreg_reg = mca_rcache_udreg_reg_func (addr , size , rcache ))) {
290+ while (UDREG_DEVICE_REG_FAILED == (udreg_reg = mca_rcache_udreg_reg_func (addr , size , rcache ))) {
276291 /* try to remove one unused reg and retry */
277292 if (!mca_rcache_udreg_evict (rcache )) {
278293 opal_output_verbose (MCA_BASE_VERBOSE_INFO , opal_rcache_base_framework .framework_output ,
@@ -281,13 +296,13 @@ static int mca_rcache_udreg_register(mca_rcache_base_module_t *rcache, void *add
281296 return OPAL_ERR_OUT_OF_RESOURCE ;
282297 }
283298 }
284- udreg_reg -> rcache_context = NULL ;
285299 }
286300
287301 OPAL_THREAD_UNLOCK (& rcache_udreg -> lock );
288302
289303 * reg = udreg_reg ;
290304 ++ udreg_reg -> ref_count ;
305+ udreg_reg -> rcache_context = udreg_entry ;
291306
292307 return OPAL_SUCCESS ;
293308}
@@ -312,7 +327,7 @@ static int mca_rcache_udreg_deregister(mca_rcache_base_module_t *rcache,
312327 OPAL_THREAD_LOCK (& rcache_udreg -> lock );
313328 UDREG_DecrRefcount (rcache_udreg -> udreg_handle , reg -> rcache_context );
314329 OPAL_THREAD_UNLOCK (& rcache_udreg -> lock );
315- } else {
330+ } else if (! reg -> ref_count ) {
316331 mca_rcache_udreg_dereg_func (reg , rcache );
317332 }
318333
0 commit comments