Add AMD compiler support, different macro expansions based on compiler

Tanush Prathi · Tanush Prathi · commit d52abd2bf202 · 2025-07-31T12:31:18.000-04:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -494,6 +494,9 @@ function(MFC_SETUP_TARGET)
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)
+                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+                    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
+                    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
                 endif()
             endif()
 
@@ -533,6 +536,9 @@ function(MFC_SETUP_TARGET)
             elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
                 find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
                 target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn)
+            elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+                find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
+                target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn flang_rt.hostdevice)
             endif()
         elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
             target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp
@@ -1,5 +1,11 @@
 #:include 'shared_parallel_macros.fpp'
 
+#:set NVIDIA_COMPILER_ID="NVHPC"
+#:set PGI_COMPILER_ID="PGI"
+#:set INTEL_COMPILER_ID="Intel"
+#:set CCE_COMPILER_ID="Cray"
+#:set AMD_COMPILER_ID="LLVMFlang"
+
 #:def OMP_MAP_STR(map_type, var_list)
     #:assert map_type is not None
     #:assert isinstance(map_type, str) 
@@ -17,8 +23,15 @@
         #:assert isinstance(default, str)
         #:assert (default == 'present' or default == 'none')
         #:if default == 'present'
-            #! #:set default_val = 'defaultmap(present:aggregate) defaultmap(present:allocatable) defaultmap(present:pointer) '
-            #:set default_val = 'defaultmap(tofrom:aggregate) defaultmap(tofrom:allocatable) defaultmap(tofrom:pointer) '
+            #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID
+                #:set default_val = 'defaultmap(tofrom:aggregate) defaultmap(tofrom:allocatable) defaultmap(tofrom:pointer) '
+            #:elif MFC_COMPILER == CCE_COMPILER_ID
+                #:set default_val = 'defaultmap(present:aggregate) defaultmap(present:allocatable) defaultmap(present:pointer) '
+            #:elif MFC_COMPILER == AMD_COMPILER_ID
+                #:set default_val = ''
+            #:else
+                #:set default_val = 'defaultmap(tofrom:aggregate) defaultmap(tofrom:allocatable) defaultmap(tofrom:pointer) '
+            #:endif
         #:elif default == 'none'
             #:stop 'Not Supported Yet'
         #:endif
@@ -160,12 +173,22 @@
         & no_create_val.strip('\n') + present_val.strip('\n') + &
         & deviceptr_val.strip('\n') + attach_val.strip('\n')
     #! Hardcoding the parallelism for now
-    !#:set omp_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' + &
-        !& clause_val + extraOmpArgs_val.strip('\n')
-    !#:set omp_end_directive = '!$omp end target teams loop'
-    #:set omp_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) ' + &
-        & clause_val + extraOmpArgs_val.strip('\n')
-    #:set omp_end_directive = '!$omp end target teams distribute parallel do simd'
+
+    #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID
+        #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) '
+        #:set omp_end_directive = '!$omp end target teams loop'
+    #:elif MFC_COMPILER == CCE_COMPILER_ID
+        #:set omp_start_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) '
+        #:set omp_end_directive = '!$omp end target teams distribute parallel do simd'
+    #:elif MFC_COMPILER == AMD_COMPILER_ID
+        #:set omp_start_directive = '!$omp target teams distribute parallel do '
+        #:set omp_end_directive = '!$omp end target teams distribute parallel do'
+    #:else
+        #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) '
+        #:set omp_end_directive = '!$omp end target teams loop'
+    #:endif
+
+    #:set omp_directive = omp_start_directive + clause_val + extraOmpArgs_val.strip('\n')
     $:omp_directive
     $:code
     $:omp_end_directive
@@ -184,7 +207,13 @@
     #:else
         #:set function_name_val = ''
     #:endif
-    #:set clause_val = nohost_val.strip('\n')
+    
+    #:if MFC_COMPILER == AMD_COMPILER_ID
+        #:set clause_val = ''
+    #:else
+        #:set clause_val = nohost_val.strip('\n')
+    #:endif
+
     #:set omp_directive = '!$omp declare target ' + &
         & clause_val + extraOmpArgs_val.strip('\n')
     $:omp_directive
@@ -201,11 +230,16 @@
     $:omp_directive
 #:enddef
 
-#! Not implemented yet
+#! Partial: only NVHPC/PGI emit '!$omp loop bind(thread)'; other compilers emit nothing. All arguments are currently ignored.
 #:def OMP_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraOmpArgs=None)
-    #! loop is going to be ignored since all loops right now are seq
-    #:set temp = ''
-    $:temp
+    #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID
+        #:set omp_directive = '!$omp loop bind(thread)'
+    #:elif MFC_COMPILER == CCE_COMPILER_ID or MFC_COMPILER == AMD_COMPILER_ID
+        #:set omp_directive = ''
+    #:else
+        #:set omp_directive = ''
+    #:endif
+    $:omp_directive
 #:enddef
 
 #:def OMP_DATA(code, copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraOmpArgs=None)
@@ -298,4 +332,25 @@
     #:set omp_directive = '!$omp barrier ' + clause_val + extraOmpArgs_val.strip('\n')
     $:omp_directive
 #:enddef
+
+#! Emits `code` unchanged unless MFC_COMPILER equals AMD_COMPILER_ID ("LLVMFlang"); in that case nothing is emitted.
+#:def UNDEF_AMD(code)
+    #:if MFC_COMPILER != AMD_COMPILER_ID
+        $:code
+    #:endif
+#:enddef
+
+#! Emits `code` unchanged unless MFC_COMPILER equals CCE_COMPILER_ID ("Cray"); in that case nothing is emitted.
+#:def UNDEF_CCE(code)
+    #:if MFC_COMPILER != CCE_COMPILER_ID
+        $:code
+    #:endif
+#:enddef
+
+#! Emits `code` unchanged unless MFC_COMPILER is NVIDIA_COMPILER_ID ("NVHPC") or PGI_COMPILER_ID ("PGI"); then nothing is emitted.
+#:def UNDEF_NVIDIA(code)
+    #:if MFC_COMPILER != NVIDIA_COMPILER_ID and MFC_COMPILER != PGI_COMPILER_ID
+        $:code
+    #:endif
+#:enddef
 ! New line at end of file is required for FYPP
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
@@ -99,6 +99,7 @@ contains
         real(wp), dimension(num_species) :: Ys
         real(wp), dimension(num_species) :: omega
 
+        #:block UNDEF_AMD
         #:call GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega]')
             do z = bounds(3)%beg, bounds(3)%end
                 do y = bounds(2)%beg, bounds(2)%end
@@ -127,6 +128,7 @@ contains
                 end do
             end do
         #:endcall GPU_PARALLEL_LOOP
+        #:endblock UNDEF_AMD
 
     end subroutine s_compute_chemistry_reaction_flux
 
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
@@ -773,6 +773,7 @@ contains
                 end if
 
                 ! FD2 or FD4 of RHS at j = 0
+                #:block UNDEF_AMD
                 #:call GPU_PARALLEL_LOOP(collapse=2, private='[alpha_rho, vel, adv_local, mf, dvel_ds, dadv_ds, Re_cbc, dalpha_rho_ds,dvel_dt, dadv_dt, dalpha_rho_dt, L, lambda, Ys, dYs_dt, dYs_ds, h_k, Cp_i, Gamma_i, Xs]')
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1105,6 +1106,7 @@ contains
                         end do
                     end do
                 #:endcall GPU_PARALLEL_LOOP
+                #:endblock UNDEF_AMD
             end if
         #:endfor
 
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
@@ -136,7 +136,7 @@ contains
         integer :: i, j, k, l !< Generic loop iterators
         integer :: ierr !< Generic flag used to identify and report GPU errors
 
-#if 0
+#:block UNDEF_CCE
         ! Restrict filter to processors that have cells adjacent to axis
         if (bc_y%beg >= 0) return
 #if defined(MFC_GPU)
@@ -304,7 +304,8 @@ contains
             end do
         end do
 #endif
-#endif
+#:endblock UNDEF_CCE
+
     end subroutine s_apply_fourier_filter
 
     !>  The purpose of this subroutine is to destroy the fftw plan
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
@@ -2943,6 +2943,7 @@ contains
 
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
             if (norm_dir == ${NORM_DIR}$) then
+                #:block UNDEF_AMD
                 #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, rho, pres,E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld]')
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3116,6 +3117,7 @@ contains
                         end do
                     end do
                 #:endcall GPU_PARALLEL_LOOP
+                #:endblock UNDEF_AMD
             end if
         #:endfor