Add line numbering to gpu loops #1029
base: master
Conversation
PR Reviewer Guide 🔍
Here are some key observations to aid the review process:
if (mhd) then
    if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const.
        ! B_y flux = v_x * B_y - v_y * Bx0
        ! B_z flux = v_x * B_z - v_z * Bx0
        $:GPU_LOOP(parallelism='[seq]')
        do i = 1, 3
            ! Flux of rho*v_i in the ${XYZ}$ direction
            ! = rho * v_i * v_${XYZ}$ - B_i * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
            flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
                (s_M*(rho_R*vel_R(i)*vel_R(norm_dir) &
                      - B%R(i)*B%R(norm_dir) &
                      + dir_flg(i)*(pres_R + pres_mag%R)) &
                 - s_P*(rho_L*vel_L(i)*vel_L(norm_dir) &
                        - B%L(i)*B%L(norm_dir) &
                        + dir_flg(i)*(pres_L + pres_mag%L)) &
                 + s_M*s_P*(rho_L*vel_L(i) - rho_R*vel_R(i))) &
                /(s_M - s_P)
        do i = 0, 1
            flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) &
                                                   - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) &
                                                   + s_M*s_P*(B%L(2 + i) - B%R(2 + i)))/(s_M - s_P)
        end do
    elseif (mhd .and. relativity) then
    else ! 2D/3D: Bx, By, Bz /= const. but zero flux component in the same direction
        ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$)
        ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$)
        ! B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$)
        $:GPU_LOOP(parallelism='[seq]')
        do i = 1, 3
            ! Flux of m_i in the ${XYZ}$ direction
            ! = m_i * v_${XYZ}$ - b_i/Gamma * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
            flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
                (s_M*(cm%R(i)*vel_R(norm_dir) &
                      - b4%R(i)/Ga%R*B%R(norm_dir) &
                      + dir_flg(i)*(pres_R + pres_mag%R)) &
                 - s_P*(cm%L(i)*vel_L(norm_dir) &
                        - b4%L(i)/Ga%L*B%L(norm_dir) &
                        + dir_flg(i)*(pres_L + pres_mag%L)) &
                 + s_M*s_P*(cm%L(i) - cm%R(i))) &
                /(s_M - s_P)
        end do
    elseif (bubbles_euler) then
        $:GPU_LOOP(parallelism='[seq]')
        do i = 1, num_vels
            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                (s_M*(rho_R*vel_R(dir_idx(1)) &
                      *vel_R(dir_idx(i)) &
                      + dir_flg(dir_idx(i))*(pres_R - ptilde_R)) &
                 - s_P*(rho_L*vel_L(dir_idx(1)) &
                        *vel_L(dir_idx(i)) &
                        + dir_flg(dir_idx(i))*(pres_L - ptilde_L)) &
                 + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
                            - rho_R*vel_R(dir_idx(i)))) &
                /(s_M - s_P) &
                + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
        end do
    else if (hypoelasticity) then
        $:GPU_LOOP(parallelism='[seq]')
        do i = 1, num_vels
            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                (s_M*(rho_R*vel_R(dir_idx(1)) &
                      *vel_R(dir_idx(i)) &
                      + dir_flg(dir_idx(i))*pres_R &
                      - tau_e_R(dir_idx_tau(i))) &
                 - s_P*(rho_L*vel_L(dir_idx(1)) &
                        *vel_L(dir_idx(i)) &
                        + dir_flg(dir_idx(i))*pres_L &
                        - tau_e_L(dir_idx_tau(i))) &
                 + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
                            - rho_R*vel_R(dir_idx(i)))) &
                /(s_M - s_P)
        end do
    else
        $:GPU_LOOP(parallelism='[seq]')
        do i = 1, num_vels
            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                (s_M*(rho_R*vel_R(dir_idx(1)) &
                      *vel_R(dir_idx(i)) &
                      + dir_flg(dir_idx(i))*pres_R) &
                 - s_P*(rho_L*vel_L(dir_idx(1)) &
                        *vel_L(dir_idx(i)) &
                        + dir_flg(dir_idx(i))*pres_L) &
                 + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
                            - rho_R*vel_R(dir_idx(i)))) &
                /(s_M - s_P) &
                + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
        do i = 0, 2
            flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (1 - dir_flg(i + 1))*( &
                s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - &
                s_P*(vel_L(dir_idx(1))*B%L(i + 1) - vel_L(i + 1)*B%L(norm_dir)) + &
                s_M*s_P*(B%L(i + 1) - B%R(i + 1)))/(s_M - s_P)
        end do
    end if
    flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
end if
Suggestion: Restrict the 1D MHD flux calculation to only execute for the x-direction (NORM_DIR == 1) to prevent incorrect flux calculations for the y and z directions. [possible issue, importance: 9]
if (mhd) then
    if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const.
        #:if (NORM_DIR == 1)
            ! B_y flux = v_x * B_y - v_y * Bx0
            ! B_z flux = v_x * B_z - v_z * Bx0
            $:GPU_LOOP(parallelism='[seq]')
            do i = 0, 1
                flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) &
                                                       - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) &
                                                       + s_M*s_P*(B%L(2 + i) - B%R(2 + i)))/(s_M - s_P)
            end do
        #:endif
    else ! 2D/3D: Bx, By, Bz /= const. but zero flux component in the same direction
        ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$)
        ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$)
        ! B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$)
        $:GPU_LOOP(parallelism='[seq]')
        do i = 0, 2
            flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (1 - dir_flg(i + 1))*( &
                s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - &
                s_P*(vel_L(dir_idx(1))*B%L(i + 1) - vel_L(i + 1)*B%L(norm_dir)) + &
                s_M*s_P*(B%L(i + 1) - B%R(i + 1)))/(s_M - s_P)
        end do
    end if
    flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
end if
rho_visc = 0._wp
gamma_visc = 0._wp
pi_inf_visc = 0._wp
$:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
Suggestion: Add the loop variable q to the private clause of the GPU_PARALLEL_LOOP directive to prevent a race condition. [possible issue, importance: 9]
- $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
+ $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,q,alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
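For context, a minimal sketch of why the inner index matters, assuming a generic OpenACC expansion of the macro (the exact directive text MFC emits, and the array and bound names below, are illustrative rather than copied from m_viscous.fpp): if q is left out of the private list, every thread shares one copy of q, and concurrent (j, k, l) iterations race on it.

    ! Illustrative only: a generic directive of the kind the macro might emit.
    !$acc parallel loop collapse(3) default(present) &
    !$acc private(i, j, k, l, q, alpha_visc, alpha_rho_visc, Re_visc, tau_Re)
    do l = 0, p
        do k = 0, n
            do j = 0, m
                ! q indexes a sequential inner loop; it must be private so that
                ! concurrent (j, k, l) iterations do not overwrite each other's q
                do q = 1, num_fluids
                    alpha_rho_visc(q) = 0._wp
                end do
            end do
        end do
    end do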
I just built this on Frontier manually and both the OpenMP and OpenACC builds compile fine. I am going to rerun the build for Frontier.
Codecov Report
❌ Patch coverage is
Additional details and impacted files

@@            Coverage Diff             @@
##           master    #1029      +/-   ##
==========================================
- Coverage   46.02%   44.39%     -1.64%
==========================================
  Files          67       70         +3
  Lines       13437    20330      +6893
  Branches     1550     1947       +397
==========================================
+ Hits         6185     9025      +2840
- Misses       6362    10211      +3849
- Partials      890     1094       +204

☔ View full report in Codecov by Sentry.
Running this by @anandrdbz and @prathi-wind. Also, the MHD parts might need to be examined by @ChrisZYJ.
We can get their input. Luckily a few tests are actually passing on Frontier, and the ones that are failing appear to mostly have
…ectives getting printed. Thanks Anand for looking at things with me
…was never called, breaking the IO
User description
Description
I have refactored the GPU macros to separate out the OpenMP and OpenACC implementations into two separate macros: one for starting a parallel loop, GPU_PARALLEL_LOOP, and one for ending a parallel region, END_GPU_PARALLEL_LOOP. This allows the preprocessor to insert new lines only at the start and end of GPU parallel loops, which means line markers can progress naturally instead of all lines inside a loop having the same line number. I have also adjusted the macro files to remove the additional warnings that were being printed with every single file that has GPU macros.
This eliminates over half of the warnings printed when the code compiles, and the line numbers reported for errors are correct. I tested this locally and found that the reported line numbers are correct, at least in the regions I checked. The result should be a much improved developer experience.
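As a minimal sketch of what this looks like at a call site (the loop body, array name, and private list here are illustrative, not taken from a particular file): before, the loop body was passed through the macro as a block, so fypp re-emitted it and every line of the body carried the same line marker; after, the macro emits only the opening directive, the body follows as ordinary source, and a separate macro closes the region, so line markers advance through the body.

    #! Before: the body passes through the macro, so its line markers collapse
    #:call GPU_PARALLEL_LOOP(collapse=3)
        do l = 0, p
            do k = 0, n
                do j = 0, m
                    q_example(j, k, l) = 0._wp
                end do
            end do
        end do
    #:endcall GPU_PARALLEL_LOOP

    #! After: only the directives come from macros; the body is plain source
    $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l]')
    do l = 0, p
        do k = 0, n
            do j = 0, m
                q_example(j, k, l) = 0._wp
            end do
        end do
    end do
    $:END_GPU_PARALLEL_LOOP()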
Fixes #1028
Type of change
Please delete options that are not relevant.
Scope
If you cannot check the above box, please split your PR into multiple PRs that each have a common goal.
How Has This Been Tested?
I compiled locally and saw that the warnings were removed. I also tried making intentional changes and saw that the correct line numbers were reported.
Test Configuration:
Checklist
I ran ./mfc.sh format before committing my code
If your code changes any code source files (anything in src/simulation)
To make sure the code is performing as expected on GPU devices, I have:
PR Type
Enhancement
Description
Refactored GPU parallel loop macros from #:call GPU_PARALLEL_LOOP() / #:endcall GPU_PARALLEL_LOOP syntax to $:GPU_PARALLEL_LOOP() / $:END_GPU_PARALLEL_LOOP() syntax
Separated OpenMP and OpenACC macro implementations into distinct start and end directives (OMP_PARALLEL_LOOP / END_OMP_PARALLEL_LOOP and ACC_PARALLEL_LOOP / END_GPU_PARALLEL_LOOP)
Added explicit private clause parameters to all GPU parallel loop macros to specify privatized loop variables
Modified macro definitions to emit only opening directives, allowing loop code to follow separately for improved line numbering accuracy
Applied changes consistently across multiple simulation and common modules including viscous stress, RHS, pressure relaxation, CBC, levelset computations, and MPI common utilities
Adjusted indentation of loop bodies to align with new macro structure
Removed embedded code execution from macro implementations to allow line markers to progress naturally
Result eliminates over half of compilation warnings and provides correct line numbers for error reporting
Diagram Walkthrough
File Walkthrough
8 files
m_viscous.fpp
Refactor GPU parallel loop macros for improved line numbering
src/simulation/m_viscous.fpp
Changed #:call GPU_PARALLEL_LOOP() / #:endcall GPU_PARALLEL_LOOP to $:GPU_PARALLEL_LOOP() / $:END_GPU_PARALLEL_LOOP() syntax
Added private parameter declarations to all GPU_PARALLEL_LOOP macro calls to specify loop variables
syntax requirements
s_compute_viscous_stress_tensor and gradient computation functions
acc_macros.fpp
Separate ACC parallel loop opening directive from code
src/common/include/acc_macros.fpp
Modified ACC_PARALLEL_LOOP macro definition to remove code parameter and acc_end_directive from macro body
follow separately
implementation
m_rhs.fpp
Refactor GPU loop macros with explicit private variables
src/simulation/m_rhs.fpp
Changed #:call GPU_PARALLEL_LOOP() and #:endcall GPU_PARALLEL_LOOP to $:GPU_PARALLEL_LOOP() and $:END_GPU_PARALLEL_LOOP() syntax
Added private clause to all GPU_PARALLEL_LOOP macros specifying loop variables to be privatized
(removed extra indentation level)
throughout the file
m_mpi_common.fpp
Update GPU macros with explicit private variable declarations
src/common/m_mpi_common.fpp
Changed #:call GPU_PARALLEL_LOOP() to $:GPU_PARALLEL_LOOP() syntax
Added private clause listing all loop variables (i, j, k, l, q, r) to GPU parallel regions
Changed #:endcall GPU_PARALLEL_LOOP to $:END_GPU_PARALLEL_LOOP()
m_pressure_relaxation.fpp
Refactor GPU parallel loop macros with private variables
src/simulation/m_pressure_relaxation.fpp
Replaced #:call GPU_PARALLEL_LOOP() with $:GPU_PARALLEL_LOOP() macro syntax
Added private='[j,k,l]' clause to specify privatized variables
Changed #:endcall GPU_PARALLEL_LOOP to $:END_GPU_PARALLEL_LOOP()
m_cbc.fpp
Refactor GPU loop macros to separate start and end directives
src/simulation/m_cbc.fpp
Changed #:call GPU_PARALLEL_LOOP() / #:endcall GPU_PARALLEL_LOOP to $:GPU_PARALLEL_LOOP() / $:END_GPU_PARALLEL_LOOP() syntax
Added private clause parameters to all GPU_PARALLEL_LOOP calls specifying loop variables
removing extra indentation from the old call-based approach
flux reshaping, and output operations
m_compute_levelset.fpp
Refactor GPU parallel loop macros in levelset computations
src/common/m_compute_levelset.fpp
Changed #:call GPU_PARALLEL_LOOP() / #:endcall GPU_PARALLEL_LOOP invocations to $:GPU_PARALLEL_LOOP() / $:END_GPU_PARALLEL_LOOP() syntax
Added private clause with loop variable lists to all GPU parallel loop macros
structure
airfoil, 3D airfoil, rectangle, cuboid, sphere, cylinder)
omp_macros.fpp
Split GPU parallel loop macro into separate start and end macros
src/common/include/omp_macros.fpp
Split OMP_PARALLEL_LOOP macro into two parts: start directive generation and end directive generation
Removed code parameter from OMP_PARALLEL_LOOP macro definition
Added END_OMP_PARALLEL_LOOP macro to emit appropriate end directives based on compiler type
Removed $:code execution from OMP_PARALLEL_LOOP, allowing loop bodies to be written directly in code
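A simplified sketch of that split, assuming a single OpenMP offload directive and a stripped-down argument list (the real omp_macros.fpp selects directives per compiler and supports more clauses):

    #! Hypothetical, reduced version of the two macros: the start macro returns
    #! only the opening directive, the end macro only the closing one, so no
    #! user code passes through a macro and fypp line markers stay accurate.
    #:def OMP_PARALLEL_LOOP(collapse=1, private='[]')
    !$omp target teams distribute parallel do collapse(${collapse}$) private(${private.strip('[]')}$)
    #:enddef

    #:def END_OMP_PARALLEL_LOOP()
    !$omp end target teams distribute parallel do
    #:enddef

A call site would then read $:OMP_PARALLEL_LOOP(collapse=3, private='[j,k,l]'), followed by the plain Fortran loop, followed by $:END_OMP_PARALLEL_LOOP(), which is the shape the new GPU_PARALLEL_LOOP / END_GPU_PARALLEL_LOOP wrappers give the simulation code.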
1 files
shared_parallel_macros.fpp
Add required newline at end of file
src/common/include/shared_parallel_macros.fpp
27 files