
Commit d406423

[Libraries/MPI] refactoring of device-initiated Jacobian sample
1 parent aa3836f commit d406423

File tree

1 file changed: +57 -48 lines changed


Libraries/MPI/jacobian_solver/src/03_jacobian_device_mpi_one-sided_device_initiated/mpi3_onesided_jacobian_gpu_sycl_device_initiated.cpp

Lines changed: 57 additions & 48 deletions
@@ -127,12 +127,10 @@ int main(int argc, char *argv[])
         t_start = MPI_Wtime();
     }
 
-    for (int i = 0; i < Niter; ++i)
-    {
-        MPI_Win cwin = win[(i + 1) % 2];
-        double *a = A_device[i % 2];
-        double *a_out = A_device[(i + 1) % 2];
+    int iterations_batch = (NormIteration <= 0) ? Niter : NormIteration;
+    for (passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
 
+        /* Submit compute kernel to calculate next "iterations_batch" steps */
         q.submit([&](auto & h) {
             h.parallel_for(sycl::nd_range<1>(work_group_size, work_group_size),
                            [=](sycl::nd_item<1> item) {
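
Note on this hunk: the refactor replaces the host loop over single Jacobi sweeps with a loop that advances through Niter in steps of iterations_batch, so one SYCL kernel submission now covers a whole batch of sweeps. The sketch below is a minimal, hypothetical illustration of that host-side control flow; run_jacobi_batch is an illustrative stand-in for the submitted kernel, not a function from the sample.

    #include <algorithm>
    #include <cstdio>

    /* Illustrative stand-in for the kernel submission that performs
     * 'count' Jacobi sweeps starting at global iteration 'first'. */
    void run_jacobi_batch(int first, int count) {
        std::printf("batch covers iterations %d..%d\n", first, first + count - 1);
    }

    int main() {
        const int Niter = 100;         /* total sweeps, as in the sample            */
        const int NormIteration = 25;  /* report the norm every NormIteration steps */

        /* Same batching rule as the refactored sample: with no norm reporting
         * requested, all iterations run in a single batch. */
        int iterations_batch = (NormIteration <= 0) ? Niter : NormIteration;

        for (int passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
            int count = std::min(iterations_batch, Niter - passed_iters);
            run_jacobi_batch(passed_iters, count);  /* one kernel launch per batch */
            /* ...norm reduction and reporting would follow here, once per batch   */
        }
        return 0;
    }
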
@@ -143,52 +141,66 @@ int main(int argc, char *argv[])
                 int my_x_lb = col_per_wg * local_id;
                 int my_x_ub = my_x_lb + col_per_wg;
 
-                /* Calculate values on borders to initiate communications early */
-                for (int column = my_x_lb; column < my_x_ub; column ++) {
-                    int idx = XY_2_IDX(column, 0, my_subarray);
-                    a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
-                                        + a[idx - ROW_SIZE(my_subarray)]
-                                        + a[idx + ROW_SIZE(my_subarray)]);
-                    idx = XY_2_IDX(column, my_subarray.y_size - 1, my_subarray);
-                    a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
-                                        + a[idx - ROW_SIZE(my_subarray)]
-                                        + a[idx + ROW_SIZE(my_subarray)]);
+                for (int k = 0; k < iterations_batch; ++k)
+                {
+                    int i = passed_iters + k;
+                    MPI_Win cwin = win[(i + 1) % 2];
+                    double *a = A_device[i % 2];
+                    double *a_out = A_device[(i + 1) % 2];
+                    /* Calculate values on borders to initiate communications early */
+                    for (int column = my_x_lb; column < my_x_ub; column ++) {
+                        int idx = XY_2_IDX(column, 0, my_subarray);
+                        a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
+                                            + a[idx - ROW_SIZE(my_subarray)]
+                                            + a[idx + ROW_SIZE(my_subarray)]);
+                        idx = XY_2_IDX(column, my_subarray.y_size - 1, my_subarray);
+                        a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
+                                            + a[idx - ROW_SIZE(my_subarray)]
+                                            + a[idx + ROW_SIZE(my_subarray)]);
+                    }
+
+                    item.barrier(sycl::access::fence_space::global_space);
+                    if (local_id == 0) {
+                        /* Perform 1D halo-exchange with neighbours */
+                        if (my_subarray.rank != 0) {
+                            int idx = XY_2_IDX(0, 0, my_subarray);
+                            MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
+                                    my_subarray.rank - 1, my_subarray.l_nbh_offt,
+                                    my_subarray.x_size, MPI_DOUBLE, cwin);
+                        }
+
+                        if (my_subarray.rank != (my_subarray.comm_size - 1)) {
+                            int idx = XY_2_IDX(0, my_subarray.y_size - 1, my_subarray);
+                            MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
+                                    my_subarray.rank + 1, 1,
+                                    my_subarray.x_size, MPI_DOUBLE, cwin);
+                        }
+                    }
+
+                    /* Recalculate internal points in parallel with comunications */
+                    for (int row = 1; row < my_subarray.y_size - 1; ++row) {
+                        for (int column = my_x_lb; column < my_x_ub; column ++) {
+                            int idx = XY_2_IDX(column, row, my_subarray);
+                            a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
+                                                + a[idx - ROW_SIZE(my_subarray)]
+                                                + a[idx + ROW_SIZE(my_subarray)]);
+                        }
+                    }
+                    item.barrier(sycl::access::fence_space::global_space);
+                    /* Ensure all communications complete before next iteration */
+                    if (local_id == 0) {
+                        MPI_Win_fence(0, cwin);
+                    }
+                    item.barrier(sycl::access::fence_space::global_space);
                 }
-
-                item.barrier(sycl::access::fence_space::global_space);
-                if (local_id == 0) {
-                    /* Perform 1D halo-exchange with neighbours */
-                    if (my_subarray.rank != 0) {
-                        int idx = XY_2_IDX(0, 0, my_subarray);
-                        MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
-                                my_subarray.rank - 1, my_subarray.l_nbh_offt,
-                                my_subarray.x_size, MPI_DOUBLE, cwin);
-                    }
-
-                    if (my_subarray.rank != (my_subarray.comm_size - 1)) {
-                        int idx = XY_2_IDX(0, my_subarray.y_size - 1, my_subarray);
-                        MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
-                                my_subarray.rank + 1, 1,
-                                my_subarray.x_size, MPI_DOUBLE, cwin);
-                    }
-                }
-
-                /* Recalculate internal points in parallel with comunications */
-                for (int row = 1; row < my_subarray.y_size - 1; ++row) {
-                    for (int column = my_x_lb; column < my_x_ub; column ++) {
-                        int idx = XY_2_IDX(column, row, my_subarray);
-                        a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
-                                            + a[idx - ROW_SIZE(my_subarray)]
-                                            + a[idx + ROW_SIZE(my_subarray)]);
-                    }
-                }
-                item.barrier(sycl::access::fence_space::global_space);
             });
         }).wait();
 
+
         /* Calculate and report norm value after given number of iterations */
         if ((NormIteration > 0) && ((NormIteration - 1) == i % NormIteration)) {
             double rank_norm = 0.0;
+
             {
                 sycl::buffer<double> norm_buf(&rank_norm, 1);
                 q.submit([&](auto & h) {
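
Note on this hunk: the per-sweep buffer and window flip (A_device[i % 2], win[(i + 1) % 2]) moves inside the kernel's batch loop, and the MPI_Win_fence that used to run on the host after each iteration is now issued device-side by work-item 0 between sweeps. The following host-side sketch only illustrates the ping-pong indexing under those assumptions; jacobi_sweep and exchange_halo are hypothetical stand-ins for the compute and MPI_Put steps, not the sample's device-initiated path.

    #include <mpi.h>

    /* Hypothetical helpers standing in for the sample's compute and halo steps. */
    void jacobi_sweep(const double *a, double *a_out) { (void)a; (void)a_out; }
    void exchange_halo(double *a_out, MPI_Win cwin)   { (void)a_out; (void)cwin; }

    /* Ping-pong pattern: buffer and window roles alternate every sweep, so the
     * sweep reads 'a' while neighbours' MPI_Put contributions land in the
     * buffer exposed by 'cwin' (the one holding 'a_out'). */
    void run_batch(double *A_device[2], MPI_Win win[2], int passed_iters, int batch) {
        for (int k = 0; k < batch; ++k) {
            int i = passed_iters + k;
            double *a     = A_device[i % 2];
            double *a_out = A_device[(i + 1) % 2];
            MPI_Win cwin  = win[(i + 1) % 2];

            jacobi_sweep(a, a_out);      /* borders first, then interior rows      */
            exchange_halo(a_out, cwin);  /* MPI_Put of border rows to neighbours   */
            MPI_Win_fence(0, cwin);      /* complete RMA before the next sweep     */
        }
    }
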
@@ -204,12 +216,9 @@ int main(int argc, char *argv[])
             /* Get global norm value */
             MPI_Reduce(&rank_norm, &norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
             if (my_subarray.rank == 0) {
-                printf("NORM value on iteration %d: %f\n", passed_iters + batch_iters + 1, sqrt(norm));
+                printf("NORM value on iteration %d: %f\n", i+1, sqrt(norm));
             }
-            rank_norm = 0.0;
         }
-        /* Ensure all communications complete before next iteration */
-        MPI_Win_fence(0, cwin);
     }
 
     if (PrintTime) {
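
Note on this hunk: the norm is still reduced across ranks and printed by rank 0 once per reporting interval, while the host-side MPI_Win_fence after the report is dropped because the fence now happens inside the kernel. Below is a hedged, self-contained sketch of that reduce-and-report shape; the rank_norm value and the reported iteration number are placeholders, not the sample's actual accumulation.

    #include <mpi.h>
    #include <cmath>
    #include <cstdio>

    int main(int argc, char *argv[]) {
        MPI_Init(&argc, &argv);
        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* Placeholder for the per-rank sum of squared updates computed on device. */
        double rank_norm = 0.5 * (rank + 1);
        double norm = 0.0;
        int reported_iteration = 25;  /* illustrative: last sweep of the batch */

        /* Same reduce-and-report shape as the sample's norm block. */
        MPI_Reduce(&rank_norm, &norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (rank == 0)
            std::printf("NORM value on iteration %d: %f\n", reported_iteration, std::sqrt(norm));

        MPI_Finalize();
        return 0;
    }
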
