Skip to content

Commit 75ed756

Browse files
committed
improve unified performance on Frontier
1 parent: 5b40ed1 · commit: 75ed756

File tree

1 file changed

+23
-35
lines changed

1 file changed

+23
-35
lines changed

src/simulation/m_time_steppers.fpp

Lines changed: 23 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -569,10 +569,10 @@ contains
569569

570570
#ifdef FRONTIER_UNIFIED
571571
$:GPU_PARALLEL_LOOP(collapse=4)
572-
do l = 0, p
573-
do k = 0, n
574-
do j = 0, m
575-
do i = 1, sys_size
572+
do i = 1, sys_size
573+
do l = 0, p
574+
do k = 0, n
575+
do j = 0, m
576576
q_cons_ts(2)%vf(i)%sf(j, k, l) = &
577577
q_cons_ts(1)%vf(i)%sf(j, k, l)
578578
q_cons_ts(1)%vf(i)%sf(j, k, l) = &
@@ -655,20 +655,16 @@ contains
655655
end if
656656

657657
! Stage 2 of 2
658-
#if defined(FRONTIER_UNIFIED)
659-
call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
660-
#else
661-
call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
662-
#endif
658+
call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
663659

664660
if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
665661

666662
#ifdef FRONTIER_UNIFIED
667663
$:GPU_PARALLEL_LOOP(collapse=4)
668-
do l = 0, p
669-
do k = 0, n
670-
do j = 0, m
671-
do i = 1, sys_size
664+
do i = 1, sys_size
665+
do l = 0, p
666+
do k = 0, n
667+
do j = 0, m
672668
q_cons_ts(1)%vf(i)%sf(j, k, l) = &
673669
(q_cons_ts(2)%vf(i)%sf(j, k, l) &
674670
+ q_cons_ts(1)%vf(i)%sf(j, k, l) &
@@ -810,10 +806,10 @@ contains
810806

811807
#ifdef FRONTIER_UNIFIED
812808
$:GPU_PARALLEL_LOOP(collapse=4)
813-
do l = 0, p
814-
do k = 0, n
815-
do j = 0, m
816-
do i = 1, sys_size
809+
do i = 1, sys_size
810+
do l = 0, p
811+
do k = 0, n
812+
do j = 0, m
817813
q_cons_ts(2)%vf(i)%sf(j, k, l) = &
818814
q_cons_ts(1)%vf(i)%sf(j, k, l)
819815
q_cons_ts(1)%vf(i)%sf(j, k, l) = &
@@ -896,20 +892,16 @@ contains
896892
end if
897893

898894
! Stage 2 of 3
899-
#if defined(FRONTIER_UNIFIED)
900-
call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
901-
#else
902-
call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
903-
#endif
895+
call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
904896

905897
if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
906898

907899
#if defined(FRONTIER_UNIFIED)
908900
$:GPU_PARALLEL_LOOP(collapse=4)
909-
do l = 0, p
910-
do k = 0, n
911-
do j = 0, m
912-
do i = 1, sys_size
901+
do i = 1, sys_size
902+
do l = 0, p
903+
do k = 0, n
904+
do j = 0, m
913905
q_cons_ts(1)%vf(i)%sf(j, k, l) = &
914906
(3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) &
915907
+ q_cons_ts(1)%vf(i)%sf(j, k, l) &
@@ -993,20 +985,16 @@ contains
993985
end if
994986

995987
! Stage 3 of 3
996-
#ifdef FRONTIER_UNIFIED
997-
call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3)
998-
#else
999-
call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3)
1000-
#endif
988+
call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3)
1001989

1002990
if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3)
1003991

1004992
#ifdef FRONTIER_UNIFIED
1005993
$:GPU_PARALLEL_LOOP(collapse=4)
1006-
do l = 0, p
1007-
do k = 0, n
1008-
do j = 0, m
1009-
do i = 1, sys_size
994+
do i = 1, sys_size
995+
do l = 0, p
996+
do k = 0, n
997+
do j = 0, m
1010998
q_cons_ts(1)%vf(i)%sf(j, k, l) = &
1011999
(q_cons_ts(2)%vf(i)%sf(j, k, l) &
10121000
+ 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) &

0 commit comments

Comments
 (0)