@@ -1787,20 +1787,18 @@ convert_type (void *dst, int dst_type, int dst_kind, void *src, int src_type,
1787
1787
}
1788
1788
1789
1789
static void
1790
- convert_with_strides (void * dst , int dst_type , int dst_kind , ptrdiff_t dst_stride ,
1791
- void * src , int src_type , int src_kind , ptrdiff_t src_stride ,
1790
+ convert_with_strides (void * dst , int dst_type , int dst_kind , ptrdiff_t byte_dst_stride ,
1791
+ void * src , int src_type , int src_kind , ptrdiff_t byte_src_stride ,
1792
1792
size_t num , int * stat )
1793
1793
{
1794
1794
/* Compute the step from one item to convert to the next in bytes. The stride
1795
1795
* is expected to be the one or similar to the array.stride, i.e. *_stride is
1796
1796
* expected to be >= 1 to progress from one item to the next. */
1797
- dst_stride = dst_stride * dst_kind ;
1798
- src_stride = src_stride * src_kind ;
1799
1797
for (size_t i = 0 ; i < num ; ++ i )
1800
1798
{
1801
1799
convert_type (dst , dst_type , dst_kind , src , src_type , src_kind , stat );
1802
- dst += dst_stride ;
1803
- src += src_stride ;
1800
+ dst += byte_dst_stride ;
1801
+ src += byte_src_stride ;
1804
1802
}
1805
1803
}
1806
1804
@@ -1867,8 +1865,10 @@ copy_to_self (gfc_descriptor_t *src, int src_kind,
1867
1865
/* When the rank is 0 then a scalar is copied to a vector and the stride
1868
1866
* is zero. */
1869
1867
convert_with_strides (dest -> base_addr , GFC_DESCRIPTOR_TYPE (dest ), dst_kind ,
1870
- 1 , src -> base_addr , GFC_DESCRIPTOR_TYPE (src ), src_kind ,
1871
- GFC_DESCRIPTOR_RANK (src ) > 0 , size , stat );
1868
+ GFC_DTYPE_TYPE_SIZE (dest ), src -> base_addr ,
1869
+ GFC_DESCRIPTOR_TYPE (src ), src_kind ,
1870
+ GFC_DESCRIPTOR_RANK (src ) > 0 ? GFC_DTYPE_TYPE_SIZE (src )
1871
+ : 0 , size , stat );
1872
1872
}
1873
1873
1874
1874
/* token: The token of the array to be written to. */
@@ -2079,17 +2079,17 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
2079
2079
dst_type = GFC_DESCRIPTOR_TYPE (dest );
2080
2080
const bool src_contiguous = PREFIX (is_contiguous ) (src ),
2081
2081
dst_contiguous = PREFIX (is_contiguous ) (dest );
2082
+ const bool same_image = caf_this_image == image_index ,
2083
+ same_type_and_kind = dst_type == src_type && dst_kind == src_kind ;
2082
2084
2083
2085
MPI_Win * p = TOKEN (token );
2084
2086
ptrdiff_t dst_offset = 0 ;
2085
2087
void * pad_str = NULL ;
2086
2088
bool free_pad_str = false;
2087
2089
void * t_buff = NULL ;
2088
2090
bool free_t_buff = false;
2089
- bool * buff_map = NULL ;
2090
2091
const bool dest_char_array_is_longer
2091
- = dst_type == BT_CHARACTER && dst_size > src_size
2092
- && caf_this_image != image_index ;
2092
+ = dst_type == BT_CHARACTER && dst_size > src_size && !same_image ;
2093
2093
const int remote_image = image_index - 1 ;
2094
2094
2095
2095
/* Ensure stat is always set. */
@@ -2108,6 +2108,8 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
2108
2108
if (size == 0 )
2109
2109
return ;
2110
2110
2111
+ dprint ("%d/%d: %s() dst_vector = %p, image_index = %d.\n" , caf_this_image , caf_num_images ,
2112
+ __FUNCTION__ , dst_vector , image_index );
2111
2113
check_image_health (image_index , stat );
2112
2114
2113
2115
/* For char arrays: create the padding array, when dst is longer than src. */
@@ -2118,7 +2120,11 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
2118
2120
/* For big arrays alloca() may not be able to get the memory on the stack.
2119
2121
* Use a regular malloc then. */
2120
2122
if ((free_pad_str = ((pad_str = alloca (pad_sz )) == NULL )))
2121
- pad_str = malloc (pad_sz );
2123
+ {
2124
+ pad_str = malloc (pad_sz );
2125
+ if (t_buff == NULL )
2126
+ caf_runtime_error ("Unable to allocate memory for internal buffer in send()." );
2127
+ }
2122
2128
if (dst_kind == 1 )
2123
2129
memset (pad_str , ' ' , pad_num );
2124
2130
else /* dst_kind == 4. */
@@ -2131,7 +2137,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
2131
2137
if (src_contiguous && dst_contiguous
2132
2138
&& dst_vector == NULL )
2133
2139
{
2134
- if (caf_this_image == image_index )
2140
+ if (same_image )
2135
2141
{
2136
2142
dprint ("%d/%d: %s() in caf_this == image_index\n" ,
2137
2143
caf_this_image , caf_num_images , __FUNCTION__ );
@@ -2149,9 +2155,13 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
2149
2155
if (dst_kind != src_kind || dest_char_array_is_longer
2150
2156
|| src_rank == 0 )
2151
2157
if ((free_t_buff = ((t_buff = alloca (dst_size * size )) == NULL )))
2152
- t_buff = malloc (dst_size * size );
2158
+ {
2159
+ t_buff = malloc (dst_size * size );
2160
+ if (t_buff == NULL )
2161
+ caf_runtime_error ("Unable to allocate memory for internal buffer in send()." );
2162
+ }
2153
2163
2154
- if ((dst_type == src_type && dst_kind == src_kind && dst_rank == src_rank )
2164
+ if ((same_type_and_kind && dst_rank == src_rank )
2155
2165
|| dst_type == BT_CHARACTER )
2156
2166
{
2157
2167
if (dest_char_array_is_longer
@@ -2173,9 +2183,9 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
2173
2183
}
2174
2184
else
2175
2185
{
2176
- convert_with_strides (t_buff , dst_type , dst_kind , 1 ,
2186
+ convert_with_strides (t_buff , dst_type , dst_kind , dst_size ,
2177
2187
src -> base_addr , src_type , src_kind ,
2178
- src_rank > 0 ,
2188
+ src_rank > 0 ? src_size : 0 ,
2179
2189
size , stat );
2180
2190
ierr = MPI_Put (t_buff , dst_size * size , MPI_BYTE , remote_image ,
2181
2191
offset , dst_size * size , MPI_BYTE , * p );
@@ -2204,8 +2214,6 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
2204
2214
#else
2205
2215
MPI_Win_flush (remote_image , * p );
2206
2216
#endif // CAF_MPI_LOCK_UNLOCK
2207
- if (free_t_buff )
2208
- free (t_buff );
2209
2217
}
2210
2218
}
2211
2219
else
@@ -2319,77 +2327,111 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
2319
2327
MPI_Type_free (& dt_d );
2320
2328
2321
2329
#else
2322
- if (caf_this_image == image_index && mrt )
2330
+ if (same_image && mrt )
2323
2331
{
2324
- t_buff = calloc (size ,GFC_DESCRIPTOR_SIZE (dest ));
2325
- buff_map = calloc (size ,sizeof (bool ));
2332
+ if ((free_t_buff = (((t_buff = alloca (dst_size * size ))) == NULL )))
2333
+ {
2334
+ t_buff = malloc (dst_size * size );
2335
+ if (t_buff == NULL )
2336
+ caf_runtime_error ("Unable to allocate memory for internal buffer in send()." );
2337
+ }
2338
+ }
2339
+ else if (!same_type_and_kind && !same_image )
2340
+ {
2341
+ if ((free_t_buff = (((t_buff = alloca (dst_size ))) == NULL )))
2342
+ {
2343
+ t_buff = malloc (dst_size );
2344
+ if (t_buff == NULL )
2345
+ caf_runtime_error ("Unable to allocate memory for internal buffer in send()." );
2346
+ }
2326
2347
}
2327
2348
2328
- CAF_Win_lock (MPI_LOCK_EXCLUSIVE , image_index - 1 , * p );
2329
- for (i = 0 ; i < size ; i ++ )
2349
+ if (!same_image )
2350
+ CAF_Win_lock (MPI_LOCK_EXCLUSIVE , remote_image , * p );
2351
+ for (i = 0 ; i < size ; ++ i )
2330
2352
{
2331
2353
ptrdiff_t array_offset_dst = 0 ;
2332
- ptrdiff_t stride = 1 ;
2333
2354
ptrdiff_t extent = 1 ;
2334
- ptrdiff_t tot_ext = 1 ;
2335
- for (j = 0 ; j < dst_rank - 1 ; j ++ )
2355
+ ptrdiff_t tot_ext = 1 ;
2356
+ for (j = 0 ; j < dst_rank - 1 ; ++ j )
2336
2357
{
2337
2358
array_offset_dst += ((i / tot_ext )
2338
2359
% (dest -> dim [j ]._ubound
2339
2360
- dest -> dim [j ].lower_bound + 1 ))
2340
2361
* dest -> dim [j ]._stride ;
2341
2362
extent = (dest -> dim [j ]._ubound - dest -> dim [j ].lower_bound + 1 );
2342
- stride = dest -> dim [j ]._stride ;
2343
- tot_ext *= extent ;
2363
+ tot_ext *= extent ;
2344
2364
}
2345
2365
2346
- array_offset_dst += (i / tot_ext ) * dest -> dim [dst_rank - 1 ]._stride ;
2347
- dst_offset = offset + array_offset_dst * GFC_DESCRIPTOR_SIZE ( dest ) ;
2366
+ array_offset_dst += (i / tot_ext ) * dest -> dim [dst_rank - 1 ]._stride ;
2367
+ dst_offset = offset + array_offset_dst * dst_size ;
2348
2368
2349
2369
void * sr ;
2350
- if (GFC_DESCRIPTOR_RANK ( src ) != 0 )
2370
+ if (src_rank != 0 )
2351
2371
{
2352
2372
ptrdiff_t array_offset_sr = 0 ;
2353
- stride = 1 ;
2354
2373
extent = 1 ;
2355
- tot_ext = 1 ;
2356
- for (j = 0 ; j < GFC_DESCRIPTOR_RANK ( src ) - 1 ; j ++ )
2374
+ tot_ext = 1 ;
2375
+ for (j = 0 ; j < src_rank - 1 ; ++ j )
2357
2376
{
2358
2377
array_offset_sr += ((i / tot_ext )
2359
2378
% (src -> dim [j ]._ubound
2360
2379
- src -> dim [j ].lower_bound + 1 ))
2361
2380
* src -> dim [j ]._stride ;
2362
2381
extent = (src -> dim [j ]._ubound - src -> dim [j ].lower_bound + 1 );
2363
- stride = src -> dim [j ]._stride ;
2364
- tot_ext *= extent ;
2382
+ tot_ext *= extent ;
2365
2383
}
2366
2384
2367
2385
array_offset_sr += (i / tot_ext ) * src -> dim [dst_rank - 1 ]._stride ;
2368
2386
sr = (void * )((char * ) src -> base_addr
2369
- + array_offset_sr * GFC_DESCRIPTOR_SIZE ( src ) );
2387
+ + array_offset_sr * src_size );
2370
2388
}
2371
2389
else
2372
2390
sr = src -> base_addr ;
2373
2391
2374
- if (caf_this_image == image_index )
2392
+ if (! same_image )
2375
2393
{
2376
- if (!mrt )
2377
- memmove (dest -> base_addr + dst_offset ,sr ,GFC_DESCRIPTOR_SIZE (src ));
2394
+ // Do the more likely first.
2395
+ dprint ("%d/%d: %s() kind(dst) = %d, el_sz(dst) = %d, kind(src) = %d, el_sz(src) = %d.\n" ,
2396
+ caf_this_image , caf_num_images , __FUNCTION__ , dst_kind ,
2397
+ dst_size , src_kind , src_size );
2398
+ if (same_type_and_kind )
2399
+ ierr = MPI_Put (sr , dst_size , MPI_BYTE , remote_image ,
2400
+ dst_offset , dst_size , MPI_BYTE , * p );
2378
2401
else
2379
2402
{
2380
- memmove (t_buff + i * GFC_DESCRIPTOR_SIZE (src ),sr ,GFC_DESCRIPTOR_SIZE (src ));
2381
- buff_map [i ] = true;
2403
+ convert_type (t_buff , dst_type , dst_kind ,
2404
+ sr , src_type , src_kind , stat );
2405
+ ierr = MPI_Put (t_buff , dst_size , MPI_BYTE , remote_image ,
2406
+ dst_offset , dst_size , MPI_BYTE , * p );
2382
2407
}
2408
+ if (pad_str )
2409
+ ierr = MPI_Put (pad_str , dst_size - src_size , MPI_BYTE , remote_image ,
2410
+ dst_offset , dst_size - src_size , MPI_BYTE , * p );
2383
2411
}
2384
2412
else
2385
2413
{
2386
- CAF_Win_lock (MPI_LOCK_EXCLUSIVE , image_index - 1 , * p );
2387
- ierr = MPI_Put (sr , GFC_DESCRIPTOR_SIZE (dest ), MPI_BYTE , image_index - 1 ,
2388
- dst_offset , GFC_DESCRIPTOR_SIZE (dest ), MPI_BYTE , * p );
2389
- if (pad_str )
2390
- ierr = MPI_Put (pad_str , dst_size - src_size , MPI_BYTE , image_index - 1 ,
2391
- dst_offset , dst_size - src_size , MPI_BYTE , * p );
2392
- CAF_Win_unlock (image_index - 1 , * p );
2414
+ if (!mrt )
2415
+ {
2416
+ dprint ("%d/%d: %s() strided same_image, no temp, for i = %d, dst_offset = %d.\n" ,
2417
+ caf_this_image , caf_num_images , __FUNCTION__ , i ,
2418
+ dst_offset );
2419
+ if (same_type_and_kind )
2420
+ memmove (dest -> base_addr + dst_offset , sr , src_size );
2421
+ else
2422
+ convert_type (dest -> base_addr + dst_offset , dst_type ,
2423
+ dst_kind , sr , src_type , src_kind , stat );
2424
+ }
2425
+ else
2426
+ {
2427
+ dprint ("%d/%d: %s() strided same_image, *WITH* temp, for i = %d.\n" ,
2428
+ caf_this_image , caf_num_images , __FUNCTION__ , i );
2429
+ if (same_type_and_kind )
2430
+ memmove (t_buff + i * dst_size , sr , src_size );
2431
+ else
2432
+ convert_type (t_buff + i * dst_size , dst_type , dst_kind ,
2433
+ sr , src_type , src_kind , stat );
2434
+ }
2393
2435
}
2394
2436
2395
2437
#ifndef WITH_FAILED_IMAGES
@@ -2400,42 +2442,39 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
2400
2442
}
2401
2443
#endif
2402
2444
}
2445
+ if (!same_image )
2446
+ CAF_Win_unlock (remote_image , * p );
2403
2447
2404
- if (caf_this_image == image_index && mrt )
2448
+
2449
+ if (same_image && mrt )
2405
2450
{
2406
- for (i = 0 ; i < size ;i ++ )
2451
+ for (i = 0 ; i < size ; ++ i )
2407
2452
{
2408
- if (buff_map [i ])
2453
+ ptrdiff_t array_offset_dst = 0 ;
2454
+ ptrdiff_t extent = 1 ;
2455
+ ptrdiff_t tot_ext = 1 ;
2456
+ for (j = 0 ; j < dst_rank - 1 ; j ++ )
2409
2457
{
2410
- ptrdiff_t array_offset_dst = 0 ;
2411
- ptrdiff_t stride = 1 ;
2412
- ptrdiff_t extent = 1 ;
2413
- ptrdiff_t tot_ext = 1 ;
2414
- for (j = 0 ; j < dst_rank - 1 ; j ++ )
2415
- {
2416
- array_offset_dst += ((i / tot_ext )
2417
- % (dest -> dim [j ]._ubound
2418
- - dest -> dim [j ].lower_bound + 1 ))
2419
- * dest -> dim [j ]._stride ;
2420
- extent = (dest -> dim [j ]._ubound - dest -> dim [j ].lower_bound + 1 );
2421
- stride = dest -> dim [j ]._stride ;
2422
- tot_ext *= extent ;
2423
- }
2424
-
2425
- //extent = (dest->dim[rank-1]._ubound - dest->dim[rank-1].lower_bound + 1);
2426
- array_offset_dst += (i / tot_ext ) * dest -> dim [dst_rank - 1 ]._stride ;
2427
- dst_offset = offset + array_offset_dst * GFC_DESCRIPTOR_SIZE (dest );
2428
- memmove (src -> base_addr + dst_offset ,t_buff + i * GFC_DESCRIPTOR_SIZE (src ),GFC_DESCRIPTOR_SIZE (src ));
2458
+ array_offset_dst += ((i / tot_ext )
2459
+ % (dest -> dim [j ]._ubound
2460
+ - dest -> dim [j ].lower_bound + 1 ))
2461
+ * dest -> dim [j ]._stride ;
2462
+ extent = (dest -> dim [j ]._ubound - dest -> dim [j ].lower_bound + 1 );
2463
+ tot_ext *= extent ;
2429
2464
}
2465
+
2466
+ array_offset_dst += (i / tot_ext ) * dest -> dim [dst_rank - 1 ]._stride ;
2467
+ dst_offset = offset + array_offset_dst * dst_size ;
2468
+ memmove (dest -> base_addr + dst_offset , t_buff +
2469
+ i * dst_size , dst_size );
2430
2470
}
2431
- free (t_buff );
2432
- free (buff_map );
2433
2471
}
2434
- CAF_Win_unlock (image_index - 1 , * p );
2435
2472
#endif
2436
2473
}
2437
2474
2438
2475
/* Free memory, when not allocated on stack. */
2476
+ if (free_t_buff )
2477
+ free (t_buff );
2439
2478
if (free_pad_str )
2440
2479
free (pad_str );
2441
2480
0 commit comments