@@ -127,12 +127,10 @@ int main(int argc, char *argv[])
         t_start = MPI_Wtime();
     }

-    for (int i = 0; i < Niter; ++i)
-    {
-        MPI_Win cwin = win[(i + 1) % 2];
-        double *a = A_device[i % 2];
-        double *a_out = A_device[(i + 1) % 2];
+    int iterations_batch = (NormIteration <= 0) ? Niter : NormIteration;
+    for (passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {

+        /* Submit compute kernel to calculate next "iterations_batch" steps */
         q.submit([&](auto &h) {
             h.parallel_for(sycl::nd_range<1>(work_group_size, work_group_size),
                            [=](sycl::nd_item<1> item) {
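For reference, the batching logic this hunk introduces can be exercised on its own. Below is a minimal, self-contained sketch (plain C++, no MPI or SYCL) of how `iterations_batch` is derived from `NormIteration` and how the outer `passed_iters` loop and inner `k` loop map onto the double-buffer indices used for `A_device` and `win`. The concrete values of `Niter` and `NormIteration` are made-up examples, and, like the diff, the sketch assumes `iterations_batch` divides `Niter` evenly (otherwise the final batch runs past `Niter`).

```cpp
#include <cstdio>

int main() {
    const int Niter = 8, NormIteration = 4;   /* example values, not from the benchmark */

    /* One kernel submission covers a whole batch of Jacobi steps; with no
       norm reporting requested, the entire run becomes a single batch. */
    int iterations_batch = (NormIteration <= 0) ? Niter : NormIteration;

    for (int passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
        for (int k = 0; k < iterations_batch; ++k) {
            int i = passed_iters + k;
            /* Double buffering: read A_device[i % 2], write A_device[(i + 1) % 2],
               and expose the written buffer through the matching window win[(i + 1) % 2]. */
            printf("iteration %d: read buffer %d, write buffer/window %d\n",
                   i, i % 2, (i + 1) % 2);
        }
    }
    return 0;
}
```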
@@ -143,52 +141,66 @@ int main(int argc, char *argv[])
                 int my_x_lb = col_per_wg * local_id;
                 int my_x_ub = my_x_lb + col_per_wg;

-                /* Calculate values on borders to initiate communications early */
-                for (int column = my_x_lb; column < my_x_ub; column++) {
-                    int idx = XY_2_IDX(column, 0, my_subarray);
-                    a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
-                                         + a[idx - ROW_SIZE(my_subarray)]
-                                         + a[idx + ROW_SIZE(my_subarray)]);
-                    idx = XY_2_IDX(column, my_subarray.y_size - 1, my_subarray);
-                    a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
-                                         + a[idx - ROW_SIZE(my_subarray)]
-                                         + a[idx + ROW_SIZE(my_subarray)]);
+                for (int k = 0; k < iterations_batch; ++k)
+                {
+                    int i = passed_iters + k;
+                    MPI_Win cwin = win[(i + 1) % 2];
+                    double *a = A_device[i % 2];
+                    double *a_out = A_device[(i + 1) % 2];
+                    /* Calculate values on borders to initiate communications early */
+                    for (int column = my_x_lb; column < my_x_ub; column++) {
+                        int idx = XY_2_IDX(column, 0, my_subarray);
+                        a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
+                                             + a[idx - ROW_SIZE(my_subarray)]
+                                             + a[idx + ROW_SIZE(my_subarray)]);
+                        idx = XY_2_IDX(column, my_subarray.y_size - 1, my_subarray);
+                        a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
+                                             + a[idx - ROW_SIZE(my_subarray)]
+                                             + a[idx + ROW_SIZE(my_subarray)]);
+                    }
+
+                    item.barrier(sycl::access::fence_space::global_space);
+                    if (local_id == 0) {
+                        /* Perform 1D halo-exchange with neighbours */
+                        if (my_subarray.rank != 0) {
+                            int idx = XY_2_IDX(0, 0, my_subarray);
+                            MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
+                                    my_subarray.rank - 1, my_subarray.l_nbh_offt,
+                                    my_subarray.x_size, MPI_DOUBLE, cwin);
+                        }
+
+                        if (my_subarray.rank != (my_subarray.comm_size - 1)) {
+                            int idx = XY_2_IDX(0, my_subarray.y_size - 1, my_subarray);
+                            MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
+                                    my_subarray.rank + 1, 1,
+                                    my_subarray.x_size, MPI_DOUBLE, cwin);
+                        }
+                    }
+
+                    /* Recalculate internal points in parallel with communications */
+                    for (int row = 1; row < my_subarray.y_size - 1; ++row) {
+                        for (int column = my_x_lb; column < my_x_ub; column++) {
+                            int idx = XY_2_IDX(column, row, my_subarray);
+                            a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
+                                                 + a[idx - ROW_SIZE(my_subarray)]
+                                                 + a[idx + ROW_SIZE(my_subarray)]);
+                        }
+                    }
+                    item.barrier(sycl::access::fence_space::global_space);
+                    /* Ensure all communications complete before next iteration */
+                    if (local_id == 0) {
+                        MPI_Win_fence(0, cwin);
+                    }
+                    item.barrier(sycl::access::fence_space::global_space);
                 }
-
-                item.barrier(sycl::access::fence_space::global_space);
-                if (local_id == 0) {
-                    /* Perform 1D halo-exchange with neighbours */
-                    if (my_subarray.rank != 0) {
-                        int idx = XY_2_IDX(0, 0, my_subarray);
-                        MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
-                                my_subarray.rank - 1, my_subarray.l_nbh_offt,
-                                my_subarray.x_size, MPI_DOUBLE, cwin);
-                    }
-
-                    if (my_subarray.rank != (my_subarray.comm_size - 1)) {
-                        int idx = XY_2_IDX(0, my_subarray.y_size - 1, my_subarray);
-                        MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
-                                my_subarray.rank + 1, 1,
-                                my_subarray.x_size, MPI_DOUBLE, cwin);
-                    }
-                }
-
-                /* Recalculate internal points in parallel with comunications */
-                for (int row = 1; row < my_subarray.y_size - 1; ++row) {
-                    for (int column = my_x_lb; column < my_x_ub; column++) {
-                        int idx = XY_2_IDX(column, row, my_subarray);
-                        a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
-                                             + a[idx - ROW_SIZE(my_subarray)]
-                                             + a[idx + ROW_SIZE(my_subarray)]);
-                    }
-                }
-                item.barrier(sycl::access::fence_space::global_space);
             });
         }).wait();

+
         /* Calculate and report norm value after given number of iterations */
         if ((NormIteration > 0) && ((NormIteration - 1) == i % NormIteration)) {
             double rank_norm = 0.0;
+
             {
                 sycl::buffer<double> norm_buf(&rank_norm, 1);
                 q.submit([&](auto &h) {
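The `MPI_Put`/`MPI_Win_fence` calls inside the kernel follow the standard active-target 1D halo exchange. The sketch below shows the same RMA pattern issued from the host with a made-up layout (`rows` interior rows of `cols` doubles per rank, plus one halo row above and below); it is only an illustration of the MPI calls, not the benchmark's device-initiated path, and it does not use the benchmark's `XY_2_IDX`/`ROW_SIZE` layout or `l_nbh_offt` offsets.

```cpp
#include <mpi.h>
#include <vector>

int main(int argc, char *argv[]) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int rows = 4, cols = 8;                    /* assumed per-rank interior block */
    std::vector<double> a((rows + 2) * cols, rank);  /* row 0 and row rows+1 are halo rows */

    MPI_Win win;
    MPI_Win_create(a.data(), a.size() * sizeof(double), sizeof(double),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    MPI_Win_fence(0, win);
    if (rank > 0)            /* push first interior row into upper neighbour's bottom halo */
        MPI_Put(&a[1 * cols], cols, MPI_DOUBLE, rank - 1,
                (rows + 1) * cols, cols, MPI_DOUBLE, win);
    if (rank < size - 1)     /* push last interior row into lower neighbour's top halo */
        MPI_Put(&a[rows * cols], cols, MPI_DOUBLE, rank + 1,
                0, cols, MPI_DOUBLE, win);
    MPI_Win_fence(0, win);   /* completes the puts before the next iteration */

    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}
```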
@@ -204,12 +216,9 @@ int main(int argc, char *argv[])
             /* Get global norm value */
             MPI_Reduce(&rank_norm, &norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
             if (my_subarray.rank == 0) {
-                printf("NORM value on iteration %d: %f\n", passed_iters + batch_iters + 1, sqrt(norm));
+                printf("NORM value on iteration %d: %f\n", i + 1, sqrt(norm));
             }
-            rank_norm = 0.0;
         }
-        /* Ensure all communications complete before next iteration */
-        MPI_Win_fence(0, cwin);
     }

     if (PrintTime) {
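For completeness, the norm-reporting path touched by the hunk above follows a common pattern: each rank accumulates a local `rank_norm`, the per-rank values are summed with `MPI_Reduce`, and rank 0 prints the square root. A minimal host-only sketch, with placeholder residuals standing in for the values the benchmark accumulates in its SYCL reduction kernel:

```cpp
#include <mpi.h>
#include <cmath>
#include <cstdio>
#include <vector>

int main(int argc, char *argv[]) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Placeholder per-element residuals; the benchmark derives these on the device. */
    std::vector<double> diff(1000, 1e-3 * (rank + 1));

    double rank_norm = 0.0;
    for (double d : diff)
        rank_norm += d * d;          /* local sum of squared residuals */

    double norm = 0.0;
    MPI_Reduce(&rank_norm, &norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("NORM value: %f\n", std::sqrt(norm));

    MPI_Finalize();
    return 0;
}
```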