From d8730b1d40133028f29d7378686df830cee1c361 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tprathi3@login-phoenix-rh9-1.pace.gatech.edu>
Date: Tue, 3 Jun 2025 14:19:38 -0400
Subject: [PATCH 01/75] Created parallel loop macro

---
 src/common/include/macros.fpp | 49 +++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index cdaa48ff58..b738676c9b 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -102,3 +102,52 @@
                          //${message or '"No error description."'}$)
     end if
 #:enddef
+
+#:def parallel_loop(collapse=None, private=None, parallelism=["gang", "vector"], default="present", firstprivate=None)
+    #:if collapse is not None
+        #:assert isinstance(collapse, int)
+        #:assert collapse > 0
+        #:set collapse_val = 'collapse(' + str(collapse) + ') '
+    #:else
+        #:set collapse_val = ""
+    #:endif
+
+    #:if private is not None
+        #:assert isinstance(private, list)
+        #:assert len(private) != 0
+        #:assert all(type(element) == str for element in private)
+        #:set private_val = 'private(' + ', '.join(private) + ') '
+    #:else
+        #:set private_val = ""
+    #:endif
+
+    #:if default is not None
+        #:assert isinstance(default, str)
+        #:assert (default == "present" or default == "none")
+        #:set default_val = 'default(' + default + ') '
+    #:else
+        #:set default_val = ""
+    #:endif
+
+    #:if parallelism is not None
+        #:assert isinstance(parallelism, list)
+        #:assert len(parallelism) != 0
+        #:assert all(type(element) == str for element in parallelism)
+        #:set parallelism_val = " ".join(parallelism) + " "
+    #:else
+        #:set parallelism_val = ""
+    #:endif
+
+    #:if firstprivate is not None
+        #:assert isinstance(firstprivate, list)
+        #:assert len(firstprivate) != 0
+        #:assert all(type(element) == str for element in firstprivate)
+        #:set firstprivate_val = 'firstprivate(' + ', '.join(firstprivate) + ') '
+    #:else
+        #:set firstprivate_val = ""
+    #:endif
+
+    #:set clause_val = collapse_val + parallelism_val + default_val + private_val + firstprivate_val
+    !$acc parallel loop ${clause_val}$
+
+#:enddef

From 8eec3d65f25411354a5dd6add634bbcb0b071b41 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 3 Jun 2025 16:52:40 -0400
Subject: [PATCH 02/75] Added more args to parallel loop macro and replaced
 some directives

---
 src/common/include/macros.fpp     | 47 +++++++++++++++++++++++++++++--
 src/common/m_boundary_common.fpp  | 24 ++++++++--------
 src/common/m_chemistry.fpp        |  3 +-
 src/simulation/m_acoustic_src.fpp |  2 +-
 src/simulation/m_bubbles_EL.fpp   |  2 +-
 src/simulation/m_cbc.fpp          |  7 ++++-
 src/simulation/m_hypoelastic.fpp  |  2 +-
 src/simulation/m_rhs.fpp          |  4 +--
 src/simulation/m_viscous.fpp      | 18 ++++++------
 9 files changed, 78 insertions(+), 31 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index b738676c9b..1e692f6220 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -103,7 +103,7 @@
     end if
 #:enddef
 
-#:def parallel_loop(collapse=None, private=None, parallelism=["gang", "vector"], default="present", firstprivate=None)
+#:def parallel_loop(collapse=None, private=None, parallelism=["gang", "vector"], default="present", firstprivate=None, copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None)
     #:if collapse is not None
         #:assert isinstance(collapse, int)
         #:assert collapse > 0
@@ -133,6 +133,7 @@
         #:assert isinstance(parallelism, list)
         #:assert len(parallelism) != 0
         #:assert all(type(element) == str for element in parallelism)
+        #:assert all((element == "gang" or element == "worker" or element == "vector") for element in parallelism)
         #:set parallelism_val = " ".join(parallelism) + " "
     #:else
         #:set parallelism_val = ""
@@ -147,7 +148,49 @@
         #:set firstprivate_val = ""
     #:endif
 
-    #:set clause_val = collapse_val + parallelism_val + default_val + private_val + firstprivate_val
+    #:if copy is not None
+        #:assert isinstance(copy, list)
+        #:assert len(copy) != 0
+        #:assert all(type(element) == str for element in copy)
+        #:set copy_val = 'copy(' + ', '.join(copy) + ') '
+    #:else
+        #:set copy_val = ""
+    #:endif
+    
+    #:if copyin is not None
+        #:assert isinstance(copyin, list)
+        #:assert len(copyin) != 0
+        #:assert all(type(element) == str for element in copyin)
+        #:assert isinstance(copyinReadOnly, bool)
+        #:if copyinReadOnly == True
+            #:set copyin_val = 'copyin(readonly:' + ', '.join(copyin) + ') '
+        #:else
+            #:set copyin_val = 'copyin(' + ', '.join(copyin) + ') '
+        #:endif
+    #:else
+        #:set copyin_val = ""
+    #:endif
+
+    #:if copyout is not None
+        #:assert isinstance(copyout, list)
+        #:assert len(copyout) != 0
+        #:assert all(type(element) == str for element in copyout)
+        #:set copyout_val = 'copyout(' + ', '.join(copyout) + ') '
+    #:else
+        #:set copyout_val = ""
+    #:endif
+
+    #:if create is not None
+        #:assert isinstance(create, list)
+        #:assert len(create) != 0
+        #:assert all(type(element) == str for element in create)
+        #:set create_val = 'create(' + ', '.join(create) + ') '
+    #:else
+        #:set create_val = ""
+    #:endif
+
+
+    #:set clause_val = collapse_val + parallelism_val + default_val + private_val + firstprivate_val + copy_val + copyin_val + copyout_val + create_val
     !$acc parallel loop ${clause_val}$
 
 #:enddef
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index ea72026dfc..1432e53a7c 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -83,7 +83,7 @@ contains
         if (bcxb >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 1, -1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (int(bc_type(1, -1)%sf(0, k, l)))
@@ -107,7 +107,7 @@ contains
         if (bcxe >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 1, 1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (int(bc_type(1, 1)%sf(0, k, l)))
@@ -135,7 +135,7 @@ contains
         if (bcyb >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 2, -1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(2, -1)%sf(k, 0, l)))
@@ -161,7 +161,7 @@ contains
         if (bcye >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 2, 1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(2, 1)%sf(k, 0, l)))
@@ -189,7 +189,7 @@ contains
         if (bczb >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 3, -1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(3, -1)%sf(k, l, 0)))
@@ -213,7 +213,7 @@ contains
         if (bcze >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 3, 1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(3, 1)%sf(k, l, 0)))
@@ -1167,7 +1167,7 @@ contains
         if (bcxb >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 1, -1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (bc_type(1, -1)%sf(0, k, l))
@@ -1185,7 +1185,7 @@ contains
         if (bcxe >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 1, 1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (bc_type(1, 1)%sf(0, k, l))
@@ -1206,7 +1206,7 @@ contains
         if (bcyb >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 2, -1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (bc_type(2, -1)%sf(k, 0, l))
@@ -1224,7 +1224,7 @@ contains
         if (bcye >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 2, 1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (bc_type(2, 1)%sf(k, 0, l))
@@ -1245,7 +1245,7 @@ contains
         if (bczb >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 3, -1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (bc_type(3, -1)%sf(k, l, 0))
@@ -1263,7 +1263,7 @@ contains
         if (bcze >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 3, 1)
         else
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (bc_type(3, 1)%sf(k, l, 0))
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index d61c42b1af..ad3e3c7187 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -99,8 +99,7 @@ contains
         real(wp), dimension(num_species) :: Ys
         real(wp), dimension(num_species) :: omega
 
-        !$acc parallel loop collapse(3) gang vector default(present) &
-        !$acc private(Ys, omega)
+        $:parallel_loop(collapse=3, private=["Ys", "omega"])
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 0205e4043b..3a02ae1b23 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -159,7 +159,7 @@ contains
 
         sim_time = t_step*dt
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 013d91988e..768a886fd8 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -614,7 +614,7 @@ contains
         if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.")
 
         ! Bubbles remain in a fixed position
-        !$acc parallel loop collapse(2) gang vector default(present) private(k) copyin(stage)
+        $:parallel_loop(collapse=2, private=["k"], copyin=["stage"])
         do k = 1, nBubs
             do l = 1, 3
                 mtn_dposdt(k, l, stage) = 0._wp
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 1c7f1a4506..b0b86cc388 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -772,7 +772,12 @@ contains
                 end if
 
                 ! FD2 or FD4 of RHS at j = 0
-                !$acc parallel loop collapse(2) gang vector default(present) private(alpha_rho, vel, adv, mf, dvel_ds, dadv_ds, Re_cbc, dalpha_rho_ds,dvel_dt, dadv_dt, dalpha_rho_dt,L, lambda,Ys,dYs_dt,dYs_ds,h_k,Cp_i,Gamma_i,Xs)
+                $:parallel_loop(collapse=2, private=["alpha_rho", "vel", "adv", &
+                                "mf", "dvel_ds", "dadv_ds", "Re_cbc", &
+                                "dalpha_rho_ds","dvel_dt", "dadv_dt", &
+                                "dalpha_rho_dt", "L", "lambda", "Ys", &
+                                "dYs_dt", "dYs_ds", "h_k", "Cp_i", &
+                                "Gamma_i", "Xs"])
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
 
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index 56f36e7ca3..5e082b4450 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -405,7 +405,7 @@ contains
             end do
         elseif (p == 0) then
             q = 0
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = 0, n
                 do k = 0, m
                     ! Maximum principal stress
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index deb869c11b..f534f2ca65 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -1496,7 +1496,7 @@ contains
                                                              idwbuff(1), idwbuff(2), idwbuff(3))
                     end if
 
-                    !$acc parallel loop collapse(2) gang vector default(present)
+                    $:parallel_loop(collapse=2)
                     do l = 0, p
                         do j = 0, m
                             !$acc loop seq
@@ -1564,7 +1564,7 @@ contains
                     end do
 
                     if (viscous) then
-                        !$acc parallel loop collapse(2) gang vector default(present)
+                        $:parallel_loop(collapse=2)
                         do l = 0, p
                             do j = 0, m
                                 !$acc loop seq
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 615448e0d3..e790651ad9 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -1354,7 +1354,7 @@ contains
             end do
         end if
 
-        !$acc parallel loop collapse(2) gang vector default(present)
+        $:parallel_loop(collapse=2)
         do l = idwbuff(3)%beg, idwbuff(3)%end
             do k = idwbuff(2)%beg, idwbuff(2)%end
                 grad_x%sf(idwbuff(1)%beg, k, l) = &
@@ -1366,7 +1366,7 @@ contains
             end do
         end do
         if (n > 0) then
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do j = idwbuff(1)%beg, idwbuff(1)%end
                     grad_y%sf(j, idwbuff(2)%beg, l) = &
@@ -1378,7 +1378,7 @@ contains
                 end do
             end do
             if (p > 0) then
-                !$acc parallel loop collapse(2) gang vector default(present)
+                $:parallel_loop(collapse=2)
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         grad_z%sf(j, k, idwbuff(3)%beg) = &
@@ -1393,7 +1393,7 @@ contains
         end if
 
         if (bc_x%beg <= BC_GHOST_EXTRAPOLATION) then
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     grad_x%sf(0, k, l) = (-3._wp*var%sf(0, k, l) + 4._wp*var%sf(1, k, l) - var%sf(2, k, l))/ &
@@ -1402,7 +1402,7 @@ contains
             end do
         end if
         if (bc_x%end <= BC_GHOST_EXTRAPOLATION) then
-            !$acc parallel loop collapse(2) gang vector default(present)
+            $:parallel_loop(collapse=2)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     grad_x%sf(m, k, l) = (3._wp*var%sf(m, k, l) - 4._wp*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ &
@@ -1412,7 +1412,7 @@ contains
         end if
         if (n > 0) then
             if (bc_y%beg <= BC_GHOST_EXTRAPOLATION .and. bc_y%beg /= BC_NULL) then
-                !$acc parallel loop collapse(2) gang vector default(present)
+                $:parallel_loop(collapse=2)
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         grad_y%sf(j, 0, l) = (-3._wp*var%sf(j, 0, l) + 4._wp*var%sf(j, 1, l) - var%sf(j, 2, l))/ &
@@ -1421,7 +1421,7 @@ contains
                 end do
             end if
             if (bc_y%end <= BC_GHOST_EXTRAPOLATION) then
-                !$acc parallel loop collapse(2) gang vector default(present)
+                $:parallel_loop(collapse=2)
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         grad_y%sf(j, n, l) = (3._wp*var%sf(j, n, l) - 4._wp*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ &
@@ -1431,7 +1431,7 @@ contains
             end if
             if (p > 0) then
                 if (bc_z%beg <= BC_GHOST_EXTRAPOLATION) then
-                    !$acc parallel loop collapse(2) gang vector default(present)
+                    $:parallel_loop(collapse=2)
                     do k = idwbuff(2)%beg, idwbuff(2)%end
                         do j = idwbuff(1)%beg, idwbuff(1)%end
                             grad_z%sf(j, k, 0) = &
@@ -1441,7 +1441,7 @@ contains
                     end do
                 end if
                 if (bc_z%end <= BC_GHOST_EXTRAPOLATION) then
-                    !$acc parallel loop collapse(2) gang vector default(present)
+                    $:parallel_loop(collapse=2)
                     do k = idwbuff(2)%beg, idwbuff(2)%end
                         do j = idwbuff(1)%beg, idwbuff(1)%end
                             grad_z%sf(j, k, p) = &

From 4056f84d3846e6dc6a486714b73745036d7bc160 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 3 Jun 2025 17:34:06 -0400
Subject: [PATCH 03/75] Replaced more directives and added reduction option

---
 src/common/include/macros.fpp         | 17 ++++++++++++++---
 src/common/m_phase_change.fpp         |  5 ++++-
 src/common/m_variables_conversion.fpp | 10 ++++++----
 src/simulation/m_acoustic_src.fpp     |  2 +-
 src/simulation/m_body_forces.fpp      |  8 ++++----
 src/simulation/m_bubbles_EE.fpp       | 10 +++++-----
 6 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 1e692f6220..75980a54f8 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -103,7 +103,7 @@
     end if
 #:enddef
 
-#:def parallel_loop(collapse=None, private=None, parallelism=["gang", "vector"], default="present", firstprivate=None, copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None)
+#:def parallel_loop(collapse=None, private=None, parallelism=["gang", "vector"], default="present", firstprivate=None, reduction=None, reductionOp=None, copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None)
     #:if collapse is not None
         #:assert isinstance(collapse, int)
         #:assert collapse > 0
@@ -148,6 +148,18 @@
         #:set firstprivate_val = ""
     #:endif
 
+    #:if reduction is not None and reductionOp is not None
+        #:assert isinstance(reduction, list)
+        #:assert len(reduction) != 0
+        #:assert all(type(element) == str for element in reduction)
+        #:assert isintance(reductionOp, str)
+        #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
+    #:elif reduction is not None or reductionOp is not None
+        #:stop "Cannot set the reduction list or reduction operation without setting the other"
+    #:else
+        #:set reduction_val = ""
+    #:endif
+
     #:if copy is not None
         #:assert isinstance(copy, list)
         #:assert len(copy) != 0
@@ -189,8 +201,7 @@
         #:set create_val = ""
     #:endif
 
-
-    #:set clause_val = collapse_val + parallelism_val + default_val + private_val + firstprivate_val + copy_val + copyin_val + copyout_val + create_val
+    #:set clause_val = collapse_val + parallelism_val + default_val + private_val + firstprivate_val + reduction_val + copy_val + copyin_val + copyout_val + create_val
     !$acc parallel loop ${clause_val}$
 
 #:enddef
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 52565515f1..28439680a3 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -97,7 +97,10 @@ contains
         !$acc declare create(p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok)
 
         ! starting equilibrium solver
-        !$acc parallel loop collapse(3) gang vector default(present) private(p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok,pS, pSOV, pSSL, TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, dynE, rhos, rho, rM, m1, m2, MCT, TvF)
+        $:parallel_loop(collapse=3, private=["p_infOV", "p_infpT", "p_infSL", &
+                        "sk", "hk", "gk", "ek", "rhok", "pS", "pSOV", "pSSL", &
+                        "TS", "TSOV", "TSatOV", "TSatSL", "TSSL", "rhoe", &
+                        "dynE", "rhos", "rho", "rM", "m1", "m2", "MCT", "TvF"])
         do j = 0, m
             do k = 0, n
                 do l = 0, p
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 288c2fb0f3..f0c4678f8f 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -883,9 +883,10 @@ contains
             end if
         #:endif
 
-        !$acc parallel loop collapse(3) gang vector default(present) &
-        !$acc private(alpha_K, alpha_rho_K, Re_K, nRtmp, rho_K, gamma_K, &
-        !$acc pi_inf_K, qv_K, dyn_pres_K, rhoYks, B)
+        
+        $:parallel_loop(collapse=3, private=["alpha_K", "alpha_rho_K", "Re_K", &
+                        "nRtmp", "rho_K", "gamma_K", "pi_inf_K", "qv_K", &
+                        "dyn_pres_K", "rhoYks", "B"])
         do l = ibounds(3)%beg, ibounds(3)%end
             do k = ibounds(2)%beg, ibounds(2)%end
                 do j = ibounds(1)%beg, ibounds(1)%end
@@ -1480,7 +1481,8 @@ contains
         ! Computing the flux variables from the primitive variables, without
         ! accounting for the contribution of either viscosity or capillarity
 #ifdef MFC_SIMULATION
-        !$acc parallel loop collapse(3) gang vector default(present) private(alpha_rho_K, vel_K, alpha_K, Re_K, Y_K)
+        $:parallel_loop(collapse=3, private=["alpha_rho_K", "vel_K", &
+                        "alpha_K", "Re_K", "Y_K"])
         do l = is3b, is3e
             do k = is2b, is2e
                 do j = is1b, is1e
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 3a02ae1b23..7b5d8a5fee 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -312,7 +312,7 @@ contains
         end do
 
         ! Update the rhs variables
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index 1ef74cbcee..7813017011 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -79,7 +79,7 @@ contains
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         integer :: i, j, k, l !< standard iterators
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -122,7 +122,7 @@ contains
 
         if (bf_x) then ! x-direction body forces
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -137,7 +137,7 @@ contains
 
         if (bf_y) then ! y-direction body forces
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -152,7 +152,7 @@ contains
 
         if (bf_z) then ! z-direction body forces
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index d2bb02cbc5..cccde36056 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -76,7 +76,7 @@ contains
         real(wp) :: nR3bar
         integer(wp) :: i, j, k, l
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -103,7 +103,7 @@ contains
         if (idir == 1) then
 
             if (.not. qbmm) then
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -119,7 +119,7 @@ contains
 
         elseif (idir == 2) then
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -133,7 +133,7 @@ contains
 
         elseif (idir == 3) then
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -173,7 +173,7 @@ contains
         integer :: dmBub_id !< Dummy variables for unified subgrid bubble subroutines
         real(wp) :: dmMass_v, dmMass_n, dmBeta_c, dmBeta_t, dmCson
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m

From 7a81f7a1623a5cf2824d00bbcaf22364a56855b9 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 3 Jun 2025 18:17:26 -0400
Subject: [PATCH 04/75] Multi-reduction support and more directives replaced

---
 src/common/include/macros.fpp   | 26 +++++++++++++++++++++-----
 src/simulation/m_bubbles_EE.fpp |  7 ++++---
 src/simulation/m_bubbles_EL.fpp | 19 ++++++++++---------
 3 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 75980a54f8..ee90eee5b6 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -149,11 +149,27 @@
     #:endif
 
     #:if reduction is not None and reductionOp is not None
-        #:assert isinstance(reduction, list)
-        #:assert len(reduction) != 0
-        #:assert all(type(element) == str for element in reduction)
-        #:assert isintance(reductionOp, str)
-        #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
+        #:if isinstance(reduction, list) and isinstance(reductionOp, list)
+            #:assert isinstance(reduction, list)
+            #:assert len(reduction) != 0
+            #:assert all(type(element) == list for element in reduction)
+            #:assert all(len(element) != 0 for element in reduction)
+            #:assert all(type(element) == str for sublist in reduction for element in sublist)
+            #:assert isinstance(reductionOp, list)
+            #:assert len(reductionOp) != 0
+            #:assert all(type(element) == str for element in reductionOp)
+            #:assert len(reduction) == len(reductionOp)
+            #:set reduction_list = ['reduction(' + op + ':' + ', '.join(red) + ') ' for (red, op) in zip(reduction, reductionOp)]
+            #:set reduction_val = " ". join(reduction_list) + " "
+        #:elif isinstance(reduction, list) and isinstance(reductionOp, str)
+            #:assert isinstance(reduction, list)
+            #:assert len(reduction) != 0
+            #:assert all(type(element) == str for element in reduction)
+            #:assert isinstance(reductionOp, str)
+            #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
+        #:else
+            #:stop "Invalid datatypes for reduction or reductionOp. Must be list of lists and lists or list and str respectively"
+        #:endif
     #:elif reduction is not None or reductionOp is not None
         #:stop "Cannot set the reduction list or reduction operation without setting the other"
     #:else
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index cccde36056..7730d60f39 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -191,8 +191,9 @@ contains
         end do
 
         adap_dt_stop_max = 0
-        !$acc parallel loop collapse(3) gang vector default(present) private(Rtmp, Vtmp, myalpha_rho, myalpha)  &
-        !$acc reduction(MAX:adap_dt_stop_max) copy(adap_dt_stop_max)
+        $:parallel_loop(collapse=3, private=["Rtmp", "Vtmp", "myalpha_rho", "myalpha"], &
+                        reduction=["adap_dt_stop_max"], reductionOp="MAX", &
+                        copy=["adap_dt_stop_max"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -324,7 +325,7 @@ contains
         if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.")
 
         if (.not. adap_dt) then
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do q = 0, n
                     do i = 0, m
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 768a886fd8..a408c11513 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -680,7 +680,7 @@ contains
 
                 call s_gradient_dir(q_prim_vf(E_idx), q_beta%vf(3), l)
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -695,7 +695,7 @@ contains
                 end do
 
                 !source in energy
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do k = idwbuff(3)%beg, idwbuff(3)%end
                     do j = idwbuff(2)%beg, idwbuff(2)%end
                         do i = idwbuff(1)%beg, idwbuff(1)%end
@@ -706,7 +706,7 @@ contains
 
                 call s_gradient_dir(q_beta%vf(3), q_beta%vf(4), l)
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -781,7 +781,7 @@ contains
                               mtn_s, mtn_pos, q_beta)
 
         !Store 1-beta
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = idwbuff(3)%beg, idwbuff(3)%end
             do k = idwbuff(2)%beg, idwbuff(2)%end
                 do j = idwbuff(1)%beg, idwbuff(1)%end
@@ -1290,7 +1290,7 @@ contains
 
         if (dir == 1) then
             ! Gradient in x dir.
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do k = 0, p
                 do j = 0, n
                     do i = 0, m
@@ -1305,7 +1305,7 @@ contains
         else
             if (dir == 2) then
                 ! Gradient in y dir.
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -1319,7 +1319,7 @@ contains
                 end do
             else
                 ! Gradient in z dir.
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -1413,8 +1413,9 @@ contains
         lag_void_max = 0._wp
         lag_void_avg = 0._wp
         lag_vol = 0._wp
-        !$acc parallel loop collapse(3) gang vector default(present) reduction(+:lag_vol,lag_void_avg) &
-        !$acc reduction(MAX:lag_void_max) copy(lag_vol, lag_void_avg, lag_void_max)
+        $:parallel_loop(collapse=3, reduction=[["lag_vol", "lag_void_avg"], &
+                        ["lag_void_max"]], reductionOp=["+", "MAX"], &
+                        copy=["lag_vol", "lag_void_avg", "lag_void_max"])
         do k = 0, p
             do j = 0, n
                 do i = 0, m

From 23dfe88d01bf13a12de4b4c4bd2ea2a735a4e3ac Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 3 Jun 2025 20:53:27 -0400
Subject: [PATCH 05/75] Replaced some directives

---
 src/simulation/m_cbc.fpp             | 34 +++++++++---------
 src/simulation/m_fftw.fpp            | 12 +++----
 src/simulation/m_hypoelastic.fpp     | 24 ++++++-------
 src/simulation/m_qbmm.fpp            |  2 +-
 src/simulation/m_rhs.fpp             | 26 +++++++-------
 src/simulation/m_riemann_solvers.fpp | 54 ++++++++++++++--------------
 src/simulation/m_surface_tension.fpp |  8 ++---
 src/simulation/m_viscous.fpp         | 46 ++++++++++++------------
 8 files changed, 103 insertions(+), 103 deletions(-)

diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index b0b86cc388..c8861c04ef 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -698,7 +698,7 @@ contains
                                                                F_src_rs${XYZ}$_vf, &
                                                                is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg)
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = 1, flux_cbc_index
                         do r = is3%beg, is3%end
                             do k = is2%beg, is2%end
@@ -710,7 +710,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = advxb, advxe
                         do r = is3%beg, is3%end
                             do k = is2%beg, is2%end
@@ -1181,7 +1181,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = 0, buff_size
@@ -1205,7 +1205,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1228,7 +1228,7 @@ contains
                     end do
                 end do
             else
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1257,7 +1257,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = 0, buff_size
@@ -1281,7 +1281,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1304,7 +1304,7 @@ contains
                     end do
                 end do
             else
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1333,7 +1333,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = 0, buff_size
@@ -1357,7 +1357,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1380,7 +1380,7 @@ contains
                     end do
                 end do
             else
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1437,7 +1437,7 @@ contains
                     end do
                 end do
             end do
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1460,7 +1460,7 @@ contains
                     end do
                 end do
             else
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1489,7 +1489,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1512,7 +1512,7 @@ contains
                     end do
                 end do
             else
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1542,7 +1542,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1565,7 +1565,7 @@ contains
                     end do
                 end do
             else
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 3c18a8c1fe..14c87440db 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -139,7 +139,7 @@ contains
         if (bc_y%beg >= 0) return
 #if defined(MFC_OpenACC)
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 1, cmplx_size
@@ -148,7 +148,7 @@ contains
             end do
         end do
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 0, p
@@ -173,7 +173,7 @@ contains
         Nfq = 3
         !$acc update device(Nfq)
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 1, Nfq
@@ -191,7 +191,7 @@ contains
 #endif
         !$acc end host_data
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 0, p
@@ -203,7 +203,7 @@ contains
 
         do i = 1, fourier_rings
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
                     do l = 1, cmplx_size
@@ -233,7 +233,7 @@ contains
             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
             !$acc update device(Nfq)
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
                     do l = 1, Nfq
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index 5e082b4450..f785cbc355 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -104,7 +104,7 @@ contains
             ! calculate velocity gradients + rho_K and G_K
             ! TODO: re-organize these loops one by one for GPU efficiency if possible?
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -114,7 +114,7 @@ contains
             end do
             !$acc end parallel loop
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -130,7 +130,7 @@ contains
             !$acc end parallel loop
 
             if (ndirs > 1) then
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do q = 0, p
                     do l = 0, n
                         do k = 0, m
@@ -140,7 +140,7 @@ contains
                 end do
                 !$acc end parallel loop
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do q = 0, p
                     do l = 0, n
                         do k = 0, m
@@ -161,7 +161,7 @@ contains
                 ! 3D
                 if (ndirs == 3) then
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do q = 0, p
                         do l = 0, n
                             do k = 0, m
@@ -172,7 +172,7 @@ contains
                     end do
                     !$acc end parallel loop
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do q = 0, p
                         do l = 0, n
                             do k = 0, m
@@ -196,7 +196,7 @@ contains
                 end if
             end if
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -220,7 +220,7 @@ contains
             end do
 
             ! apply rhs source term to elastic stress equation
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -234,7 +234,7 @@ contains
             end do
 
         elseif (idir == 2) then
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -269,7 +269,7 @@ contains
             end do
 
         elseif (idir == 3) then
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -337,7 +337,7 @@ contains
 
         if (cyl_coord .and. idir == 2) then
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -419,7 +419,7 @@ contains
                 end do
             end do
         else
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 1c52cee2fa..e69c126e44 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -480,7 +480,7 @@ contains
                 end do
             end if
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do q = 0, n
                     do i = 0, m
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index f534f2ca65..043ece5ca7 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -646,7 +646,7 @@ contains
         ! Converting Conservative to Primitive Variables
 
         if (mpp_lim .and. bubbles_euler) then
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
@@ -867,7 +867,7 @@ contains
         ! END: Dimensional Splitting Loop
 
         if (ib) then
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -975,7 +975,7 @@ contains
         real(wp) :: advected_qty_val, pressure_val, velocity_val
 
         if (alt_soundspeed) then
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do q_loop = 0, p
                 do l_loop = 0, n
                     do k_loop = 0, m
@@ -1432,7 +1432,7 @@ contains
         if (idir == 1) then ! x-direction
 
             if (surface_tension) then
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1446,7 +1446,7 @@ contains
                 end do
             end if
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -1464,7 +1464,7 @@ contains
         elseif (idir == 2) then ! y-direction
 
             if (surface_tension) then
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1511,7 +1511,7 @@ contains
 
                 end if
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = 0, p
                     do k = 1, n
                         do j = 0, m
@@ -1527,7 +1527,7 @@ contains
                 end do
 
             else
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1548,7 +1548,7 @@ contains
             if (cyl_coord) then
                 if ((bc_y%beg == BC_REFLECTIVE) .or. (bc_y%beg == BC_AXIS)) then
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do l = 0, p
                         do k = 1, n
                             do j = 0, m
@@ -1578,7 +1578,7 @@ contains
                     end if
                 else
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do l = 0, p
                         do k = 0, n
                             do j = 0, m
@@ -1599,7 +1599,7 @@ contains
         elseif (idir == 3) then ! z-direction
 
             if (surface_tension) then
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1613,7 +1613,7 @@ contains
                 end do
             end if
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -1629,7 +1629,7 @@ contains
             end do
 
             if (grid_geometry == 3) then
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 52a4640524..1cc4651c72 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -3504,7 +3504,7 @@ contains
         if (norm_dir == 1) then
 
             if (bc_x%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3515,7 +3515,7 @@ contains
                 end do
 
                 if (viscous) then
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do k = isy%beg, isy%end
@@ -3527,7 +3527,7 @@ contains
                     end do
 
                     if (n > 0) then
-                        !$acc parallel loop collapse(3) gang vector default(present)
+                        $:parallel_loop(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do k = isy%beg, isy%end
@@ -3539,7 +3539,7 @@ contains
                         end do
 
                         if (p > 0) then
-                            !$acc parallel loop collapse(3) gang vector default(present)
+                            $:parallel_loop(collapse=3)
                             do i = momxb, momxe
                                 do l = isz%beg, isz%end
                                     do k = isy%beg, isy%end
@@ -3559,7 +3559,7 @@ contains
 
             if (bc_x%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3571,7 +3571,7 @@ contains
 
                 if (viscous) then
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do k = isy%beg, isy%end
@@ -3583,7 +3583,7 @@ contains
                     end do
 
                     if (n > 0) then
-                        !$acc parallel loop collapse(3) gang vector default(present)
+                        $:parallel_loop(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do k = isy%beg, isy%end
@@ -3595,7 +3595,7 @@ contains
                         end do
 
                         if (p > 0) then
-                            !$acc parallel loop collapse(3) gang vector default(present)
+                            $:parallel_loop(collapse=3)
                             do i = momxb, momxe
                                 do l = isz%beg, isz%end
                                     do k = isy%beg, isy%end
@@ -3618,7 +3618,7 @@ contains
         elseif (norm_dir == 2) then
 
             if (bc_y%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3630,7 +3630,7 @@ contains
 
                 if (viscous) then
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3640,7 +3640,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3651,7 +3651,7 @@ contains
                     end do
 
                     if (p > 0) then
-                        !$acc parallel loop collapse(3) gang vector default(present)
+                        $:parallel_loop(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do j = isx%beg, isx%end
@@ -3668,7 +3668,7 @@ contains
 
             if (bc_y%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3680,7 +3680,7 @@ contains
 
                 if (viscous) then
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3690,7 +3690,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3701,7 +3701,7 @@ contains
                     end do
 
                     if (p > 0) then
-                        !$acc parallel loop collapse(3) gang vector default(present)
+                        $:parallel_loop(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do j = isx%beg, isx%end
@@ -3721,7 +3721,7 @@ contains
         else
 
             if (bc_z%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3732,7 +3732,7 @@ contains
                 end do
 
                 if (viscous) then
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3741,7 +3741,7 @@ contains
                             end do
                         end do
                     end do
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3750,7 +3750,7 @@ contains
                             end do
                         end do
                     end do
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3765,7 +3765,7 @@ contains
 
             if (bc_z%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3776,7 +3776,7 @@ contains
                 end do
 
                 if (viscous) then
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3786,7 +3786,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3796,7 +3796,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -4357,7 +4357,7 @@ contains
                 end do
             end if
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = is3%beg, is3%end
                 do j = is1%beg, is1%end
                     do k = is2%beg, is2%end
@@ -4410,7 +4410,7 @@ contains
                 end do
             end if
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do j = is1%beg, is1%end
                 do k = is2%beg, is2%end
                     do l = is3%beg, is3%end
@@ -4447,7 +4447,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = is1%beg, is1%end
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 8dac00ee39..5fe98868fd 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -241,7 +241,7 @@ contains
         isx%end = m; isy%end = n; isz%end = p
 
         ! compute gradient components
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -251,7 +251,7 @@ contains
             end do
         end do
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -262,7 +262,7 @@ contains
         end do
 
         if (p > 0) then
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -273,7 +273,7 @@ contains
             end do
         end if
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index e790651ad9..1db8b74d09 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -77,7 +77,7 @@ contains
 
         !$acc update device(is1_viscous, is2_viscous, is3_viscous)
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
             do k = is2_viscous%beg, is2_viscous%end
                 do j = is1_viscous%beg, is1_viscous%end
@@ -589,7 +589,7 @@ contains
 
             !$acc update device(is1_viscous, is2_viscous, is3_viscous)
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = iy%beg, iy%end
                     do j = is1_viscous%beg + 1, is1_viscous%end
@@ -604,7 +604,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end - 1
@@ -621,7 +621,7 @@ contains
 
             if (n > 0) then
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg, is1_viscous%end
@@ -636,7 +636,7 @@ contains
                     end do
                 end do
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg, is1_viscous%end
@@ -651,7 +651,7 @@ contains
                     end do
                 end do
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -670,7 +670,7 @@ contains
                     end do
                 end do
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -690,7 +690,7 @@ contains
                     end do
                 end do
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg + 1, is1_viscous%end
@@ -710,7 +710,7 @@ contains
                     end do
                 end do
 
-                !$acc parallel loop collapse(3) gang vector default(present)
+                $:parallel_loop(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg, is1_viscous%end - 1
@@ -732,7 +732,7 @@ contains
 
                 if (p > 0) then
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
@@ -748,7 +748,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
@@ -764,7 +764,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg + 1, is1_viscous%end
@@ -785,7 +785,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end - 1
@@ -806,7 +806,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg + 1, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
@@ -827,7 +827,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
@@ -848,7 +848,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
@@ -869,7 +869,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
@@ -889,7 +889,7 @@ contains
                             end do
                         end do
                     end do
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -909,7 +909,7 @@ contains
                             end do
                         end do
                     end do
-                    !$acc parallel loop collapse(3) gang vector default(present)
+                    $:parallel_loop(collapse=3)
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -1206,7 +1206,7 @@ contains
             ! cell-boundaries, to calculate the cell-averaged first-order
             ! spatial derivatives inside the cell.
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -1234,7 +1234,7 @@ contains
             ! cell-boundaries, to calculate the cell-averaged first-order
             ! spatial derivatives inside the cell.
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg + 1, is2_viscous%end - 1
                     do j = is1_viscous%beg, is1_viscous%end
@@ -1262,7 +1262,7 @@ contains
             ! cell-boundaries, to calculate the cell-averaged first-order
             ! spatial derivatives inside the cell.
 
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:parallel_loop(collapse=3)
             do l = is3_viscous%beg + 1, is3_viscous%end - 1
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end
@@ -1317,7 +1317,7 @@ contains
 
         !$acc update device(is1_viscous, is2_viscous, is3_viscous)
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:parallel_loop(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
             do k = is2_viscous%beg, is2_viscous%end
                 do j = is1_viscous%beg, is1_viscous%end

From 76f79e069240005e6b7a86295809d80f6facc94f Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 3 Jun 2025 20:55:46 -0400
Subject: [PATCH 06/75] Replaced more directives

---
 src/simulation/m_body_forces.fpp     |  2 +-
 src/simulation/m_bubbles_EL.fpp      |  6 ++---
 src/simulation/m_cbc.fpp             | 34 ++++++++++++++--------------
 src/simulation/m_rhs.fpp             |  6 ++---
 src/simulation/m_riemann_solvers.fpp | 28 +++++++++++------------
 src/simulation/m_time_steppers.fpp   | 14 ++++++------
 src/simulation/m_viscous.fpp         | 12 +++++-----
 src/simulation/m_weno.fpp            |  6 ++---
 8 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index 7813017011..0bbe1ec1b9 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -109,7 +109,7 @@ contains
         call s_compute_acceleration(mytime)
         call s_compute_mixture_density(q_cons_vf)
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = momxb, E_idx
             do l = 0, p
                 do k = 0, n
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index a408c11513..2e9aec51bf 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -644,7 +644,7 @@ contains
         if (lag_params%solver_approach == 2) then
 
             if (p == 0) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -660,7 +660,7 @@ contains
                     end do
                 end do
             else
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -766,7 +766,7 @@ contains
 
         call nvtxStartRange("BUBBLES-LAGRANGE-KERNELS")
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = 1, q_beta_idx
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index c8861c04ef..03e3d2d91e 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -729,7 +729,7 @@ contains
                                                                F_src_rs${XYZ}$_vf, &
                                                                is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg)
 
-                    !$acc parallel loop collapse(4) gang vector default(present)
+                    $:parallel_loop(collapse=4)
                     do i = 1, flux_cbc_index
                         do j = 0, 1
                             do r = is3%beg, is3%end
@@ -749,7 +749,7 @@ contains
                         end do
                     end do
 
-                    !$acc parallel loop collapse(4) gang vector default(present)
+                    $:parallel_loop(collapse=4)
                     do i = advxb, advxe
                         do j = 0, 1
                             do r = is3%beg, is3%end
@@ -1169,7 +1169,7 @@ contains
         ! Reshaping Inputted Data in x-direction
         if (cbc_dir == 1) then
 
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, sys_size
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1192,7 +1192,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1216,7 +1216,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1245,7 +1245,7 @@ contains
             ! Reshaping Inputted Data in y-direction
         elseif (cbc_dir == 2) then
 
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, sys_size
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1268,7 +1268,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1292,7 +1292,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1321,7 +1321,7 @@ contains
             ! Reshaping Inputted Data in z-direction
         else
 
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, sys_size
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1344,7 +1344,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1368,7 +1368,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1425,7 +1425,7 @@ contains
         ! Reshaping Outputted Data in x-direction
         if (cbc_dir == 1) then
 
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1448,7 +1448,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1476,7 +1476,7 @@ contains
             ! Reshaping Outputted Data in y-direction
         elseif (cbc_dir == 2) then
 
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1500,7 +1500,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1529,7 +1529,7 @@ contains
             ! Reshaping Outputted Data in z-direction
         else
 
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1553,7 +1553,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 043ece5ca7..e5e8ba9f39 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -592,7 +592,7 @@ contains
             !$acc update device(Res, Re_idx, Re_size)
         end if
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do id = 1, num_dims
             do i = 1, sys_size
                 do l = idwbuff(3)%beg, idwbuff(3)%end
@@ -632,7 +632,7 @@ contains
 
         call cpu_time(t_start)
         ! Association/Population of Working Variables
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = 1, sys_size
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
@@ -935,7 +935,7 @@ contains
         ! END: Additional physics and source terms
 
         if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange) then
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, sys_size
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do k = idwbuff(2)%beg, idwbuff(2)%end
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 1cc4651c72..a0c4cd50cd 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -3852,7 +3852,7 @@ contains
 
             if (viscous .or. (surface_tension)) then
 
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = momxb, E_idx
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3866,7 +3866,7 @@ contains
 
             if (qbmm) then
 
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = 1, 4
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3882,7 +3882,7 @@ contains
         elseif (norm_dir == 2) then
 
             if (viscous .or. (surface_tension)) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = momxb, E_idx
                     do l = is3%beg, is3%end
                         do j = is1%beg, is1%end
@@ -3895,7 +3895,7 @@ contains
             end if
 
             if (qbmm) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = 1, 4
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3911,7 +3911,7 @@ contains
         else
 
             if (viscous .or. (surface_tension)) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = momxb, E_idx
                     do j = is1%beg, is1%end
                         do k = is2%beg, is2%end
@@ -3924,7 +3924,7 @@ contains
             end if
 
             if (qbmm) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = 1, 4
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -4331,7 +4331,7 @@ contains
 
         ! Reshaping Outputted Data in y-direction
         if (norm_dir == 2) then
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, sys_size
                 do l = is3%beg, is3%end
                     do j = is1%beg, is1%end
@@ -4344,7 +4344,7 @@ contains
             end do
 
             if (cyl_coord) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do j = is1%beg, is1%end
@@ -4368,7 +4368,7 @@ contains
             end do
 
             if (riemann_solver == 1 .or. riemann_solver == 4) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = advxb + 1, advxe
                     do l = is3%beg, is3%end
                         do j = is1%beg, is1%end
@@ -4383,7 +4383,7 @@ contains
             end if
             ! Reshaping Outputted Data in z-direction
         elseif (norm_dir == 3) then
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, sys_size
                 do j = is1%beg, is1%end
                     do k = is2%beg, is2%end
@@ -4396,7 +4396,7 @@ contains
                 end do
             end do
             if (grid_geometry == 3) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = 1, sys_size
                     do j = is1%beg, is1%end
                         do k = is2%beg, is2%end
@@ -4421,7 +4421,7 @@ contains
             end do
 
             if (riemann_solver == 1 .or. riemann_solver == 4) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = advxb + 1, advxe
                     do j = is1%beg, is1%end
                         do k = is2%beg, is2%end
@@ -4435,7 +4435,7 @@ contains
 
             end if
         elseif (norm_dir == 1) then
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do i = 1, sys_size
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -4458,7 +4458,7 @@ contains
             end do
 
             if (riemann_solver == 1 .or. riemann_solver == 4) then
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do i = advxb + 1, advxe
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 7dd75f7486..21748e5aad 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -377,7 +377,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -479,7 +479,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -551,7 +551,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -661,7 +661,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -733,7 +733,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -806,7 +806,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3)
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -1024,7 +1024,7 @@ contains
         call nvtxStartRange("RHS-BODYFORCES")
         call s_compute_body_forces_rhs(q_prim_vf, q_cons_vf, rhs_vf)
 
-        !$acc parallel loop collapse(4) gang vector default(present)
+        $:parallel_loop(collapse=4)
         do i = momxb, E_idx
             do l = 0, p
                 do k = 0, n
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 1db8b74d09..6a84cd63ac 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -1019,7 +1019,7 @@ contains
         if (viscous) then
             if (weno_Re_flux) then
                 if (norm_dir == 2) then
-                    !$acc parallel loop collapse(4) gang vector default(present)
+                    $:parallel_loop(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end
@@ -1031,7 +1031,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 3) then
-                    !$acc parallel loop collapse(4) gang vector default(present)
+                    $:parallel_loop(collapse=4)
                     do i = iv%beg, iv%end
                         do j = is1_viscous%beg, is1_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1043,7 +1043,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 1) then
-                    !$acc parallel loop collapse(4) gang vector default(present)
+                    $:parallel_loop(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1118,7 +1118,7 @@ contains
         if (viscous) then
             if (weno_Re_flux) then
                 if (norm_dir == 2) then
-                    !$acc parallel loop collapse(4) gang vector default(present)
+                    $:parallel_loop(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end
@@ -1130,7 +1130,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 3) then
-                    !$acc parallel loop collapse(4) gang vector default(present)
+                    $:parallel_loop(collapse=4)
                     do i = iv%beg, iv%end
                         do j = is1_viscous%beg, is1_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1142,7 +1142,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 1) then
-                    !$acc parallel loop collapse(4) gang vector default(present)
+                    $:parallel_loop(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 39e0b281a8..e390192f6b 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -1138,7 +1138,7 @@ contains
         !$acc update device(v_size)
 
         if (weno_dir == 1) then
-            !$acc parallel loop collapse(4) gang vector default(present)
+            $:parallel_loop(collapse=4)
             do j = 1, v_size
                 do q = is3_weno%beg, is3_weno%end
                     do l = is2_weno%beg, is2_weno%end
@@ -1176,7 +1176,7 @@ contains
                 end if
             else
 #endif
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do j = 1, v_size
                     do q = is3_weno%beg, is3_weno%end
                         do l = is2_weno%beg, is2_weno%end
@@ -1206,7 +1206,7 @@ contains
                 end block
             else
 #endif
-                !$acc parallel loop collapse(4) gang vector default(present)
+                $:parallel_loop(collapse=4)
                 do j = 1, v_size
                     do q = is3_weno%beg, is3_weno%end
                         do l = is2_weno%beg, is2_weno%end

From aa9b5c68bee62a7b53c6e4eeb55ed9c16776cc76 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 4 Jun 2025 12:11:48 -0400
Subject: [PATCH 07/75] More directives replaced and formatter

---
 src/common/include/macros.fpp         |  3 +--
 src/common/m_phase_change.fpp         |  6 ++---
 src/common/m_variables_conversion.fpp |  7 +++---
 src/simulation/m_bubbles_EE.fpp       |  4 +--
 src/simulation/m_bubbles_EL.fpp       |  4 +--
 src/simulation/m_cbc.fpp              |  8 +++---
 src/simulation/m_data_output.fpp      |  2 +-
 src/simulation/m_fftw.fpp             |  4 +--
 src/simulation/m_hyperelastic.fpp     |  4 +--
 src/simulation/m_mhd.fpp              |  3 +--
 src/simulation/m_mpi_proxy.fpp        | 36 +++++++++++++--------------
 src/simulation/m_qbmm.fpp             |  5 +++-
 src/simulation/m_rhs.fpp              | 35 +++++++++++++++++---------
 src/simulation/m_time_steppers.fpp    | 24 +++++++++---------
 14 files changed, 77 insertions(+), 68 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index ee90eee5b6..70ee3deb93 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -184,7 +184,7 @@
     #:else
         #:set copy_val = ""
     #:endif
-    
+
     #:if copyin is not None
         #:assert isinstance(copyin, list)
         #:assert len(copyin) != 0
@@ -219,5 +219,4 @@
 
     #:set clause_val = collapse_val + parallelism_val + default_val + private_val + firstprivate_val + reduction_val + copy_val + copyin_val + copyout_val + create_val
     !$acc parallel loop ${clause_val}$
-
 #:enddef
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 28439680a3..5522484167 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -98,9 +98,9 @@ contains
 
         ! starting equilibrium solver
         $:parallel_loop(collapse=3, private=["p_infOV", "p_infpT", "p_infSL", &
-                        "sk", "hk", "gk", "ek", "rhok", "pS", "pSOV", "pSSL", &
-                        "TS", "TSOV", "TSatOV", "TSatSL", "TSSL", "rhoe", &
-                        "dynE", "rhos", "rho", "rM", "m1", "m2", "MCT", "TvF"])
+            "sk", "hk", "gk", "ek", "rhok", "pS", "pSOV", "pSSL", &
+            "TS", "TSOV", "TSatOV", "TSatSL", "TSSL", "rhoe", &
+            "dynE", "rhos", "rho", "rM", "m1", "m2", "MCT", "TvF"])
         do j = 0, m
             do k = 0, n
                 do l = 0, p
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index f0c4678f8f..433616bc95 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -883,10 +883,9 @@ contains
             end if
         #:endif
 
-        
         $:parallel_loop(collapse=3, private=["alpha_K", "alpha_rho_K", "Re_K", &
-                        "nRtmp", "rho_K", "gamma_K", "pi_inf_K", "qv_K", &
-                        "dyn_pres_K", "rhoYks", "B"])
+            "nRtmp", "rho_K", "gamma_K", "pi_inf_K", "qv_K", &
+        "dyn_pres_K", "rhoYks", "B"])
         do l = ibounds(3)%beg, ibounds(3)%end
             do k = ibounds(2)%beg, ibounds(2)%end
                 do j = ibounds(1)%beg, ibounds(1)%end
@@ -1482,7 +1481,7 @@ contains
         ! accounting for the contribution of either viscosity or capillarity
 #ifdef MFC_SIMULATION
         $:parallel_loop(collapse=3, private=["alpha_rho_K", "vel_K", &
-                        "alpha_K", "Re_K", "Y_K"])
+            "alpha_K", "Re_K", "Y_K"])
         do l = is3b, is3e
             do k = is2b, is2e
                 do j = is1b, is1e
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index 7730d60f39..13371d9406 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -192,8 +192,8 @@ contains
 
         adap_dt_stop_max = 0
         $:parallel_loop(collapse=3, private=["Rtmp", "Vtmp", "myalpha_rho", "myalpha"], &
-                        reduction=["adap_dt_stop_max"], reductionOp="MAX", &
-                        copy=["adap_dt_stop_max"])
+            reduction=["adap_dt_stop_max"], reductionOp="MAX", &
+            copy=["adap_dt_stop_max"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 2e9aec51bf..8567ec97ae 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -1414,8 +1414,8 @@ contains
         lag_void_avg = 0._wp
         lag_vol = 0._wp
         $:parallel_loop(collapse=3, reduction=[["lag_vol", "lag_void_avg"], &
-                        ["lag_void_max"]], reductionOp=["+", "MAX"], &
-                        copy=["lag_vol", "lag_void_avg", "lag_void_max"])
+            ["lag_void_max"]], reductionOp=["+", "MAX"], &
+            copy=["lag_vol", "lag_void_avg", "lag_void_max"])
         do k = 0, p
             do j = 0, n
                 do i = 0, m
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 03e3d2d91e..67cdcc13fc 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -773,11 +773,9 @@ contains
 
                 ! FD2 or FD4 of RHS at j = 0
                 $:parallel_loop(collapse=2, private=["alpha_rho", "vel", "adv", &
-                                "mf", "dvel_ds", "dadv_ds", "Re_cbc", &
-                                "dalpha_rho_ds","dvel_dt", "dadv_dt", &
-                                "dalpha_rho_dt", "L", "lambda", "Ys", &
-                                "dYs_dt", "dYs_ds", "h_k", "Cp_i", &
-                                "Gamma_i", "Xs"])
+                    "mf", "dvel_ds", "dadv_ds", "Re_cbc", "dalpha_rho_ds","dvel_dt", &
+                    "dadv_dt", "dalpha_rho_dt", "L", "lambda", "Ys", "dYs_dt", & 
+                    "dYs_ds", "h_k", "Cp_i", "Gamma_i", "Xs"])
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
 
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index b66e13d9c2..b778a0b4a2 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -277,7 +277,7 @@ contains
         integer :: j, k, l
 
         ! Computing Stability Criteria at Current Time-step
-        !$acc parallel loop collapse(3) gang vector default(present) private(vel, alpha, Re)
+        $:parallel_loop(collapse=3, private=["vel", "alpha", "Re"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 14c87440db..e3476b4471 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -212,7 +212,7 @@ contains
                 end do
             end do
 
-            !$acc parallel loop collapse(3) gang vector default(present) firstprivate(i)
+            $:parallel_loop(collapse=3, firstprivate=["i"])
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
@@ -251,7 +251,7 @@ contains
 #endif
             !$acc end host_data
 
-            !$acc parallel loop collapse(3) gang vector default(present) firstprivate(i)
+            $:parallel_loop(collapse=3, firstprivate=["i"])
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index be79e98232..ada0069746 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -106,8 +106,8 @@ contains
         real(wp) :: G
         integer :: j, k, l, i, r
 
-        !$acc parallel loop collapse(3) gang vector default(present) private(alpha_K, alpha_rho_K, &
-        !$acc rho, gamma, pi_inf, qv, G, Re, tensora, tensorb)
+        $:parallel_loop(collapse=3, private=["alpha_K", "alpha_rho_K", "rho", &
+            "gamma", "pi_inf", "qv", "G", "Re", "tensora", "tensorb"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index f5730b513f..a9d89f8df8 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -76,8 +76,7 @@ contains
         real(wp), dimension(3) :: v, B
         real(wp) :: divB, vdotB
 
-        !$acc parallel loop collapse(3) gang vector default(present) &
-        !$acc private(v, B)
+        $:parallel_loop(collapse=3, private=["v", "B"])
         do q = 0, p
             do l = 0, n
                 do k = 0, m
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index f931227f4d..43f993e35c 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -798,7 +798,7 @@ contains
             if (bc_x%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_x%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -853,7 +853,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_x%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = 0, p
                     do k = 0, n
                         do j = 0, gp_layers - 1
@@ -913,7 +913,7 @@ contains
 #endif
 
             ! Unpacking buffer received from bc_x%beg
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:parallel_loop(collapse=3, private=["r"])
             do l = 0, p
                 do k = 0, n
                     do j = -gp_layers, -1
@@ -929,7 +929,7 @@ contains
 
             if (bc_x%beg >= 0) then      ! PBC at the end and beginning
 
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 ! Packing buffer to be sent to bc_x%beg
                 do l = 0, p
                     do k = 0, n
@@ -983,7 +983,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_x%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -1041,7 +1041,7 @@ contains
             end if
 
             ! Unpacking buffer received from bc_x%end
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:parallel_loop(collapse=3, private=["r"])
             do l = 0, p
                 do k = 0, n
                     do j = m + 1, m + gp_layers
@@ -1061,7 +1061,7 @@ contains
             if (bc_y%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_y%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1117,7 +1117,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_y%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1179,7 +1179,7 @@ contains
 #endif
 
             ! Unpacking buffer received from bc_y%beg
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:parallel_loop(collapse=3, private=["r"])
             do l = 0, p
                 do k = -gp_layers, -1
                     do j = -gp_layers, m + gp_layers
@@ -1197,7 +1197,7 @@ contains
             if (bc_y%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_y%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1253,7 +1253,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_y%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1315,7 +1315,7 @@ contains
 #endif
 
             ! Unpacking buffer received form bc_y%end
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:parallel_loop(collapse=3, private=["r"])
             do l = 0, p
                 do k = n + 1, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1335,7 +1335,7 @@ contains
             if (bc_z%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_z%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1392,7 +1392,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_z%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1454,7 +1454,7 @@ contains
 #endif
 
             ! Unpacking buffer from bc_z%beg
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:parallel_loop(collapse=3, private=["r"])
             do l = -gp_layers, -1
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1473,7 +1473,7 @@ contains
             if (bc_z%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_z%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1528,7 +1528,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_z%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:parallel_loop(collapse=3, private=["r"])
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1590,7 +1590,7 @@ contains
 #endif
 
             ! Unpacking buffer received from bc_z%end
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:parallel_loop(collapse=3, private=["r"])
             do l = p + 1, p + gp_layers
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index e69c126e44..1b5e99985b 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -839,7 +839,10 @@ contains
 
         !$acc update device(is1_qbmm, is2_qbmm, is3_qbmm)
 
-        !$acc parallel loop collapse(3) gang vector default(present) private(moms, msum, wght, abscX, abscY, wght_pb, wght_mv, wght_ht, coeff, ht, r, q, n_tait, B_tait, pres, rho, nbub, c, alf, momsum, drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, T_bar, grad_T)
+        $:parallel_loop(collapse=3, private=["moms", "msum", "wght", "abscX", &
+            "abscY", "wght_pb", "wght_mv", "wght_ht", "coeff", "ht", "r", "q", &
+            "n_tait", "B_tait", "pres", "rho", "nbub", "c", "alf", "momsum", &
+            "drdt", "drdt2", "chi_vw", "x_vw", "rho_mw", "k_mw", "T_bar", "grad_T"])
         do id3 = is3_qbmm%beg, is3_qbmm%end
             do id2 = is2_qbmm%beg, is2_qbmm%end
                 do id1 = is1_qbmm%beg, is1_qbmm%end
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index e5e8ba9f39..bdefed9491 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -1231,8 +1231,9 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            !$acc parallel loop collapse(3) gang vector default(present) &
-                            !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2)
+                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                                "local_q_cons_val", "local_k_term_val", &
+                                "local_term_coeff", "local_flux1", "local_flux2"])
                             do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m
                                         local_inv_ds = 1._wp/dx(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(k_idx, l_idx, q_idx)
@@ -1244,8 +1245,10 @@ contains
                                                                                     local_inv_ds*local_term_coeff*(local_flux1 - local_flux2)
                                     end do; end do; end do
 
-                            !$acc parallel loop collapse(3) gang vector default(present) &
-                            !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2)
+                            $:parallel_loop(collapse=3, private=["local_inv_ds",&
+                                "local_q_cons_val", "local_k_term_val", &
+                                "local_term_coeff", "local_flux1", &
+                                "local_flux2"])
                             do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m
                                         local_inv_ds = 1._wp/dx(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(k_idx, l_idx, q_idx)
@@ -1295,8 +1298,10 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            !$acc parallel loop collapse(3) gang vector default(present) &
-                            !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2)
+                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                                "local_q_cons_val", "local_k_term_val", &
+                                "local_term_coeff", "local_flux1", &
+                                "local_flux2"])
                             do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m
                                         local_inv_ds = 1._wp/dy(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(q_idx, k_idx, l_idx)
@@ -1312,8 +1317,10 @@ contains
                                         end if
                                     end do; end do; end do
 
-                            !$acc parallel loop collapse(3) gang vector default(present) &
-                            !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2)
+                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                                "local_q_cons_val", "local_k_term_val", &
+                                "local_term_coeff", "local_flux1", &
+                                "local_flux2"])
                             do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m
                                         local_inv_ds = 1._wp/dy(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(q_idx, k_idx, l_idx)
@@ -1372,8 +1379,10 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            !$acc parallel loop collapse(3) gang vector default(present) &
-                            !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2)
+                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                                "local_q_cons_val", "local_k_term_val", &
+                                "local_term_coeff", "local_flux1", &
+                                "local_flux2"])
                             do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m
                                         local_inv_ds = 1._wp/dz(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(l_idx, q_idx, k_idx)
@@ -1385,8 +1394,10 @@ contains
                                                                                     local_inv_ds*local_term_coeff*(local_flux1 - local_flux2)
                                     end do; end do; end do
 
-                            !$acc parallel loop collapse(3) gang vector default(present) &
-                            !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2)
+                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                                "local_q_cons_val", "local_k_term_val", &
+                                "local_term_coeff", "local_flux1", &
+                                "local_flux2"])
                             do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m
                                         local_inv_ds = 1._wp/dz(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(l_idx, q_idx, k_idx)
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 21748e5aad..4592d4024e 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -392,7 +392,7 @@ contains
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -409,7 +409,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -494,7 +494,7 @@ contains
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -511,7 +511,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -566,7 +566,7 @@ contains
         end do
 
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -584,7 +584,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -676,7 +676,7 @@ contains
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -693,7 +693,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -748,7 +748,7 @@ contains
         end do
 
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -766,7 +766,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -821,7 +821,7 @@ contains
         end do
 
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -839,7 +839,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            !$acc parallel loop collapse(5) gang vector default(present)
+            $:parallel_loop(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n

From d3e5ac7d486803d4d180a01106ce1e4ad247ec20 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 4 Jun 2025 13:25:54 -0400
Subject: [PATCH 08/75] More directives replaced

---
 src/simulation/m_rhs.fpp             |  3 +-
 src/simulation/m_riemann_solvers.fpp | 85 ++++++++++++++++------------
 src/simulation/m_surface_tension.fpp | 15 +++--
 src/simulation/m_time_steppers.fpp   |  2 +-
 src/simulation/m_viscous.fpp         | 12 ++--
 5 files changed, 68 insertions(+), 49 deletions(-)

diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index bdefed9491..c628067dfe 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -1695,7 +1695,8 @@ contains
         integer :: i, j, k, l, q, iter !< Generic loop iterators
         integer :: relax !< Relaxation procedure determination variable
 
-        !$acc parallel loop collapse(3) gang vector private(pres_K_init, rho_K_s, alpha_rho, alpha, Re, pres_relax)
+        $:parallel_loop(collapse=3, private=["pres_K_init", "rho_K_s", &
+            "alpha_rho", "alpha", "Re", "pres_relax"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index a0c4cd50cd..1aee1317d8 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -362,15 +362,14 @@ contains
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
 
             if (norm_dir == ${NORM_DIR}$) then
-                !$acc parallel loop collapse(3) gang vector default(present)    &
-                !$acc private(alpha_rho_L, alpha_rho_R, vel_L, vel_R, alpha_L,  &
-                !$acc alpha_R, tau_e_L, tau_e_R, G_L, G_R, Re_L, Re_R,          &
-                !$acc rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, Ys_L, Ys_R,     &
-                !$acc xi_field_L, xi_field_R,                                   &
-                !$acc Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR,             &
-                !$acc Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2,                     &
-                !$acc c_fast, pres_mag, B, Ga, vdotB, B2, b4, cm,               &
-                !$acc pcorr, zcoef, vel_L_tmp, vel_R_tmp)
+                $:parallel_loop(collapse=3, private=["alpha_rho_L", "alpha_rho_R", &
+                    "vel_L", "vel_R", "alpha_L", "alpha_R", "tau_e_L", "tau_e_R", &
+                    "G_L", "G_R", "Re_L", "Re_R", "rho_avg", "h_avg", "gamma_avg", &
+                    "s_L", "s_R", "s_S", "Ys_L", "Ys_R", "xi_field_L", "xi_field_R", &
+                    "Cp_iL", "Cp_iR", "Xs_L", "Xs_R", "Gamma_iL", "Gamma_iR", &
+                    "Yi_avg", "Phi_avg", "h_iL", "h_iR", "h_avg_2", "c_fast", &
+                    "pres_mag", "B", "Ga", "vdotB", "B2", "b4", "cm", "pcorr", &
+                    "zcoef", "vel_L_tmp", "vel_R_tmp"])
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = is1%beg, is1%end
@@ -1296,13 +1295,14 @@ contains
                 ! 6-EQUATION MODEL WITH HLLC
                 if (model_eqns == 3) then
                     !ME3
-
-                    !$acc parallel loop collapse(3) gang vector default(present)                    &
-                    !$acc private(vel_L, vel_R, vel_K_Star, Re_L, Re_R, rho_avg, h_avg, gamma_avg,  &
-                    !$acc s_L, s_R, s_S, vel_avg_rms, alpha_L, alpha_R, Ys_L, Ys_R, Xs_L, Xs_R,     &
-                    !$acc Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2,   &
-                    !$acc tau_e_L, tau_e_R, G_L, G_R, flux_ene_e, xi_field_L, xi_field_R, pcorr,    &
-                    !$acc zcoef, vel_L_tmp, vel_R_tmp)
+                    $:parallel_loop(collapse=3, private=["vel_L", "vel_R", &
+                        "vel_K_Star", "Re_L", "Re_R", "rho_avg", "h_avg", &
+                        "gamma_avg", "s_L", "s_R", "s_S", "vel_avg_rms", &
+                        "alpha_L", "alpha_R", "Ys_L", "Ys_R", "Xs_L", "Xs_R", &
+                        "Gamma_iL", "Gamma_iR", "Cp_iL", "Cp_iR", "Yi_avg", &
+                        "Phi_avg", "h_iL", "h_iR", "h_avg_2", "tau_e_L", &
+                        "tau_e_R", "G_L", "G_R", "flux_ene_e", "xi_field_L", &
+                        "xi_field_R", "pcorr", "zcoef", "vel_L_tmp", "vel_R_tmp"])
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
@@ -1728,8 +1728,10 @@ contains
 
                 elseif (model_eqns == 4) then
                     !ME4
-                    !$acc parallel loop collapse(3) gang vector default(present) private(alpha_rho_L, alpha_rho_R, vel_L, vel_R, alpha_L, alpha_R, &
-                    !$acc rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, vel_avg_rms, nbub_L, nbub_R, ptilde_L, ptilde_R)
+                    $:parallel_loop(collapse=3, private=["alpha_rho_L", &
+                        "alpha_rho_R", "vel_L", "vel_R", "alpha_L", "alpha_R", &
+                        "rho_avg", "h_avg", "gamma_avg", "s_L", "s_R", "s_S", &
+                        "vel_avg_rms", "nbub_L", "nbub_R", "ptilde_L", "ptilde_R"])
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
@@ -1979,8 +1981,12 @@ contains
                     !$acc end parallel loop
 
                 elseif (model_eqns == 2 .and. bubbles_euler) then
-                    !$acc parallel loop collapse(3) gang vector default(present) private(R0_L, R0_R, V0_L, V0_R, P0_L, P0_R, pbw_L, pbw_R, vel_L, vel_R, &
-                    !$acc rho_avg, alpha_L, alpha_R, h_avg, gamma_avg, s_L, s_R, s_S, nbub_L, nbub_R, ptilde_L, ptilde_R, vel_avg_rms, Re_L, Re_R, pcorr, zcoef, vel_L_tmp, vel_R_tmp)
+                    $:parallel_loop(collapse=3, private=["R0_L", "R0_R", "V0_L", &
+                        "V0_R", "P0_L", "P0_R", "pbw_L", "pbw_R", "vel_L", &
+                        "vel_R", "rho_avg", "alpha_L", "alpha_R", "h_avg", &
+                        "gamma_avg", "s_L", "s_R", "s_S", "nbub_L", "nbub_R", &
+                        "ptilde_L", "ptilde_R", "vel_avg_rms", "Re_L", "Re_R", &
+                        "pcorr", "zcoef", "vel_L_tmp", "vel_R_tmp"])
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
@@ -2446,11 +2452,15 @@ contains
                     !$acc end parallel loop
                 else
                     ! 5-EQUATION MODEL WITH HLLC
-                    !$acc parallel loop collapse(3) gang vector default(present) private(vel_L, vel_R, Re_L, Re_R, &
-                    !$acc rho_avg, h_avg, gamma_avg, alpha_L, alpha_R, s_L, s_R, s_S, vel_avg_rms, pcorr, zcoef,   &
-                    !$acc vel_L_tmp, vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR,          &
-                    !$acc tau_e_L, tau_e_R, xi_field_L, xi_field_R,                                                &
-                    !$acc Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2) copyin(is1,is2,is3)
+                    $:parallel_loop(collapse=3, private=["vel_L", "vel_R", &
+                        "Re_L", "Re_R", "rho_avg", "h_avg", "gamma_avg", &
+                        "alpha_L", "alpha_R", "s_L", "s_R", "s_S", &
+                        "vel_avg_rms", "pcorr", "zcoef", "vel_L_tmp", &
+                        "vel_R_tmp", "Ys_L", "Ys_R", "Xs_L", "Xs_R", &
+                        "Gamma_iL", "Gamma_iR", "Cp_iL", "Cp_iR", "tau_e_L", &
+                        "tau_e_R", "xi_field_L", "xi_field_R", "Yi_avg", &
+                        "Phi_avg", "h_iL", "h_iR", "h_avg_2"], &
+                        copyin=["is1", "is2", "is3"])
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
@@ -3068,10 +3078,12 @@ contains
 
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
             if (norm_dir == ${NORM_DIR}$) then
-                !$acc parallel loop collapse(3) gang vector default(present) &
-                !$acc private(alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, &
-                !$acc rho, pres, E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, &
-                !$acc U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld)
+                $:parallel_loop(collapse=3, private=["alpha_rho_L", &
+                    "alpha_rho_R", "vel", "alpha_L", "alpha_R", "rho", "pres", &
+                    "E", "H_no_mag", "gamma", "pi_inf", "qv", "vel_rms", "B", &
+                    "c", "c_fast", "pres_mag", "U_L", "U_R", "U_starL", &
+                    "U_starR", "U_doubleL", "U_doubleR", "F_L", "F_R", &
+                    "F_starL", "F_starR", "F_hlld"])
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = is1%beg, is1%end
@@ -3990,10 +4002,10 @@ contains
         integer :: i_vel             !!< Loop iterator for velocity components.
         integer :: idx_rp(3)         !!< Indices $(j,k,l)$ of 'right' point for averaging.
 
-        !$acc parallel loop collapse(3) gang vector default(present) &
-        !$acc private(idx_rp, avg_v_int, avg_dvdx_int, avg_dvdy_int, avg_dvdz_int, &
-        !$acc         Re_s, Re_b, vel_src_int, r_eff, divergence_cyl, &
-        !$acc         stress_vector_shear, stress_normal_bulk, div_v_term_const)
+        $:parallel_loop(collapse=3, private=["idx_rp", "avg_v_int", &
+            "avg_dvdx_int", "avg_dvdy_int", "avg_dvdz_int", "Re_s", "Re_b", &
+            "vel_src_int", "r_eff", "divergence_cyl", "stress_vector_shear", &
+            "stress_normal_bulk", "div_v_term_const"])
         do l = iz%beg, iz%end
             do k = iy%beg, iy%end
                 do j = ix%beg, ix%end
@@ -4158,10 +4170,9 @@ contains
 
         real(wp) :: divergence_v   !< Velocity divergence at interface.
 
-        !$acc parallel loop collapse(3) gang vector default(present) &
-        !$acc private(idx_right_phys, vel_grad_avg, &
-        !$acc current_tau_shear, current_tau_bulk, vel_src_at_interface, &
-        !$acc Re_shear, Re_bulk, divergence_v, i_dim, vel_comp_idx)
+        $:parallel_loop(collapse=3, private=["idx_right_phys", "vel_grad_avg", &
+            "current_tau_shear", "current_tau_bulk", "vel_src_at_interface", &
+            "Re_shear", "Re_bulk", "divergence_v", "i_dim", "vel_comp_idx"])
         do l_loop = isz%beg, isz%end
             do k_loop = isy%beg, isy%end
                 do j_loop = isx%beg, isx%end
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 5fe98868fd..549b409ac5 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -86,8 +86,9 @@ contains
         integer :: j, k, l, i
 
         if (id == 1) then
-            !$acc parallel loop collapse(3) gang vector default(present) private(Omega, &
-            !$acc w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW)
+            $:parallel_loop(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
+                "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", &
+                "normWR", "normW"])
             do l = isz%beg, isz%end
                 do k = isy%beg, isy%end
                     do j = isx%beg, isx%end
@@ -132,8 +133,9 @@ contains
 
         elseif (id == 2) then
 
-            !$acc parallel loop collapse(3) gang vector default(present) private(Omega, &
-            !$acc w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW)
+            $:parallel_loop(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
+                "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", "normWR", &
+                "normW"])
             do l = isz%beg, isz%end
                 do k = isy%beg, isy%end
                     do j = isx%beg, isx%end
@@ -178,8 +180,9 @@ contains
 
         elseif (id == 3) then
 
-            !$acc parallel loop collapse(3) gang vector default(present) private(Omega, &
-            !$acc w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW)
+            $:parallel_loop(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
+                "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", "normWR", &
+                "normW"])
             do l = isz%beg, isz%end
                 do k = isy%beg, isy%end
                     do j = isx%beg, isx%end
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 4592d4024e..5bafcd96ed 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -981,7 +981,7 @@ contains
             idwint, &
             gm_alpha_qp%vf)
 
-        !$acc parallel loop collapse(3) gang vector default(present) private(vel, alpha, Re)
+        $:parallel_loop(collapse=3, private=["vel", "alpha", "Re"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 6a84cd63ac..db40bbaaf2 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -89,7 +89,8 @@ contains
             end do
         end do
         if (shear_stress) then    ! Shear stresses
-            !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re )
+            $:parallel_loop(collapse=3, private=["alpha_visc", &
+                "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
@@ -196,7 +197,8 @@ contains
         end if
 
         if (bulk_stress) then    ! Bulk stresses
-            !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re )
+            $:parallel_loop(collapse=3, private=["alpha_visc", &
+                "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
@@ -300,7 +302,8 @@ contains
         if (p == 0) return
 
         if (shear_stress) then    ! Shear stresses
-            !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re )
+            $:parallel_loop(collapse=3, private=["alpha_visc", &
+                "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
@@ -408,7 +411,8 @@ contains
         end if
 
         if (bulk_stress) then    ! Bulk stresses
-            !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re )
+            $:parallel_loop(collapse=3, private=["alpha_visc", &
+                "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end

From 2e61a77d928d7971a84e27adda7f7e94be6babf0 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 5 Jun 2025 13:28:43 -0400
Subject: [PATCH 09/75] Don't change mpi_proxy as will likely cause merge
 conflicts

---
 src/simulation/m_mpi_proxy.fpp | 36 +++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 43f993e35c..f931227f4d 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -798,7 +798,7 @@ contains
             if (bc_x%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_x%end
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -853,7 +853,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_x%beg
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, gp_layers - 1
@@ -913,7 +913,7 @@ contains
 #endif
 
             ! Unpacking buffer received from bc_x%beg
-            $:parallel_loop(collapse=3, private=["r"])
+            !$acc parallel loop collapse(3) gang vector default(present) private(r)
             do l = 0, p
                 do k = 0, n
                     do j = -gp_layers, -1
@@ -929,7 +929,7 @@ contains
 
             if (bc_x%beg >= 0) then      ! PBC at the end and beginning
 
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 ! Packing buffer to be sent to bc_x%beg
                 do l = 0, p
                     do k = 0, n
@@ -983,7 +983,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_x%end
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -1041,7 +1041,7 @@ contains
             end if
 
             ! Unpacking buffer received from bc_x%end
-            $:parallel_loop(collapse=3, private=["r"])
+            !$acc parallel loop collapse(3) gang vector default(present) private(r)
             do l = 0, p
                 do k = 0, n
                     do j = m + 1, m + gp_layers
@@ -1061,7 +1061,7 @@ contains
             if (bc_y%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_y%end
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1117,7 +1117,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_y%beg
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1179,7 +1179,7 @@ contains
 #endif
 
             ! Unpacking buffer received from bc_y%beg
-            $:parallel_loop(collapse=3, private=["r"])
+            !$acc parallel loop collapse(3) gang vector default(present) private(r)
             do l = 0, p
                 do k = -gp_layers, -1
                     do j = -gp_layers, m + gp_layers
@@ -1197,7 +1197,7 @@ contains
             if (bc_y%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_y%beg
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1253,7 +1253,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_y%end
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1315,7 +1315,7 @@ contains
 #endif
 
             ! Unpacking buffer received form bc_y%end
-            $:parallel_loop(collapse=3, private=["r"])
+            !$acc parallel loop collapse(3) gang vector default(present) private(r)
             do l = 0, p
                 do k = n + 1, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1335,7 +1335,7 @@ contains
             if (bc_z%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_z%end
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1392,7 +1392,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_z%beg
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1454,7 +1454,7 @@ contains
 #endif
 
             ! Unpacking buffer from bc_z%beg
-            $:parallel_loop(collapse=3, private=["r"])
+            !$acc parallel loop collapse(3) gang vector default(present) private(r)
             do l = -gp_layers, -1
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1473,7 +1473,7 @@ contains
             if (bc_z%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_z%beg
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1528,7 +1528,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_z%end
-                $:parallel_loop(collapse=3, private=["r"])
+                !$acc parallel loop collapse(3) gang vector default(present) private(r)
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1590,7 +1590,7 @@ contains
 #endif
 
             ! Unpacking buffer received from bc_z%end
-            $:parallel_loop(collapse=3, private=["r"])
+            !$acc parallel loop collapse(3) gang vector default(present) private(r)
             do l = p + 1, p + gp_layers
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers

From 891b71488033513cdf46cf689beaae4d7941e7be Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 5 Jun 2025 17:53:30 -0400
Subject: [PATCH 10/75] Made macro captialized and updated macro with DRY

---
 src/common/include/macros.fpp         | 198 ++++++++++++++++----------
 src/common/m_boundary_common.fpp      |  24 ++--
 src/common/m_chemistry.fpp            |   2 +-
 src/common/m_phase_change.fpp         |   2 +-
 src/common/m_variables_conversion.fpp |   4 +-
 src/simulation/m_acoustic_src.fpp     |   4 +-
 src/simulation/m_body_forces.fpp      |  10 +-
 src/simulation/m_bubbles_EE.fpp       |  14 +-
 src/simulation/m_bubbles_EL.fpp       |  24 ++--
 src/simulation/m_cbc.fpp              |  70 ++++-----
 src/simulation/m_data_output.fpp      |   2 +-
 src/simulation/m_fftw.fpp             |  16 +--
 src/simulation/m_hyperelastic.fpp     |   2 +-
 src/simulation/m_hypoelastic.fpp      |  26 ++--
 src/simulation/m_mhd.fpp              |   2 +-
 src/simulation/m_qbmm.fpp             |   4 +-
 src/simulation/m_rhs.fpp              |  50 +++----
 src/simulation/m_riemann_solvers.fpp  |  98 ++++++-------
 src/simulation/m_surface_tension.fpp  |  14 +-
 src/simulation/m_time_steppers.fpp    |  40 +++---
 src/simulation/m_viscous.fpp          |  84 +++++------
 src/simulation/m_weno.fpp             |   6 +-
 22 files changed, 374 insertions(+), 322 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 70ee3deb93..180cf83c71 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -103,68 +103,109 @@
     end if
 #:enddef
 
-#:def parallel_loop(collapse=None, private=None, parallelism=["gang", "vector"], default="present", firstprivate=None, reduction=None, reductionOp=None, copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None)
-    #:if collapse is not None
-        #:assert isinstance(collapse, int)
-        #:assert collapse > 0
-        #:set collapse_val = 'collapse(' + str(collapse) + ') '
+#:def ASSERT_LIST(data, datatype)
+    #:assert data is not None
+    #:assert isinstance(data, list)
+    #:assert len(data) != 0
+    #:assert all(isinstance(element, datatype) for element in data)
+#:enddef
+
+#:def GEN_PARENTHESES_CLAUSE(clause_name, clause_list)
+    #:assert isinstance(clause_name, str)
+    #:if clause_list is not None
+        $:ASSERT_LIST(clause_list, str)
+        #:set clause_str = clause_name + '(' + ', '.join(clause_list) + ') '
     #:else
-        #:set collapse_val = ""
+        #:set clause_str = ""
     #:endif
+    $:clause_str
+#:enddef
 
-    #:if private is not None
-        #:assert isinstance(private, list)
-        #:assert len(private) != 0
-        #:assert all(type(element) == str for element in private)
-        #:set private_val = 'private(' + ', '.join(private) + ') '
+#:def GEN_COPY_STR(copy)
+    #:set copy_val = GEN_PARENTHESES_CLAUSE("copy", copy)
+    $:copy_val
+#:enddef
+
+#:def GEN_COPYIN_STR(copyin, copyinReadOnly)
+    #:if copyin is not None
+        $:ASSERT_LIST(copyin, str)
+        #:assert isinstance(copyinReadOnly, bool)
+        #:if copyinReadOnly == True
+            #:set copyin_val = 'copyin(readonly:' + ', '.join(copyin) + ') '
+        #:else
+            #:set copyin_val = 'copyin(' + ', '.join(copyin) + ') '
+        #:endif
     #:else
-        #:set private_val = ""
+        #:set copyin_val = ""
     #:endif
+    $:copyin_val
+#:enddef
 
-    #:if default is not None
-        #:assert isinstance(default, str)
-        #:assert (default == "present" or default == "none")
-        #:set default_val = 'default(' + default + ') '
+#:def GEN_COPYOUT_STR(copyout)
+    #:set copyout_val = GEN_PARENTHESES_CLAUSE("copyout", copyout)
+    $:copyout_val
+#:enddef
+
+#:def GEN_CREATE_STR(create)
+    #:set create_val = GEN_PARENTHESES_CLAUSE("create", create)
+    $:create_val
+#:enddef
+
+#:def GEN_EXTRA_ARGS_STR(extraArgs)
+    #:if extraArgs is not None
+        #:assert isinstance(extraArgs, str)
+        #:set extraArgs_val = extraArgs
     #:else
-        #:set default_val = ""
+        #:set extraArgs_val = ""
     #:endif
+    $:extraArgs_val
+#:enddef
 
+#:def GEN_PARALLELISM_STR(parallelism)
     #:if parallelism is not None
-        #:assert isinstance(parallelism, list)
-        #:assert len(parallelism) != 0
-        #:assert all(type(element) == str for element in parallelism)
-        #:assert all((element == "gang" or element == "worker" or element == "vector") for element in parallelism)
+        $:ASSERT_LIST(parallelism, str)
+        #:assert all((element == "gang" or element == "worker" or element == "vector" or element == "seq") for element in parallelism)
         #:set parallelism_val = " ".join(parallelism) + " "
     #:else
         #:set parallelism_val = ""
     #:endif
+    $:parallelism_val
+#:enddef
 
-    #:if firstprivate is not None
-        #:assert isinstance(firstprivate, list)
-        #:assert len(firstprivate) != 0
-        #:assert all(type(element) == str for element in firstprivate)
-        #:set firstprivate_val = 'firstprivate(' + ', '.join(firstprivate) + ') '
+#:def PARALLEL_LOOP(collapse=None, private=None, parallelism=["gang", "vector"], default="present", firstprivate=None, reduction=None, reductionOp=None, copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None, extraAccArgs=None)
+    #:if collapse is not None
+        #:assert isinstance(collapse, int)
+        #:assert collapse > 1
+        #:set collapse_val = 'collapse(' + str(collapse) + ') '
+    #:else
+        #:set collapse_val = ""
+    #:endif
+
+    #:set private_val = GEN_PARENTHESES_CLAUSE("private", private)
+
+    #:if default is not None
+        #:assert isinstance(default, str)
+        #:assert (default == "present" or default == "none")
+        #:set default_val = 'default(' + default + ') '
     #:else
-        #:set firstprivate_val = ""
+        #:set default_val = ""
     #:endif
 
+    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
+
+    #:set firstprivate_val = GEN_PARENTHESES_CLAUSE("firstprivate", firstprivate)
+
     #:if reduction is not None and reductionOp is not None
         #:if isinstance(reduction, list) and isinstance(reductionOp, list)
-            #:assert isinstance(reduction, list)
-            #:assert len(reduction) != 0
-            #:assert all(type(element) == list for element in reduction)
+            $:ASSERT_LIST(reduction, list)
+            $:ASSERT_LIST(reductionOp, str)
             #:assert all(len(element) != 0 for element in reduction)
             #:assert all(type(element) == str for sublist in reduction for element in sublist)
-            #:assert isinstance(reductionOp, list)
-            #:assert len(reductionOp) != 0
-            #:assert all(type(element) == str for element in reductionOp)
             #:assert len(reduction) == len(reductionOp)
             #:set reduction_list = ['reduction(' + op + ':' + ', '.join(red) + ') ' for (red, op) in zip(reduction, reductionOp)]
             #:set reduction_val = " ". join(reduction_list) + " "
         #:elif isinstance(reduction, list) and isinstance(reductionOp, str)
-            #:assert isinstance(reduction, list)
-            #:assert len(reduction) != 0
-            #:assert all(type(element) == str for element in reduction)
+            $:ASSERT_LIST(reduction, str)
             #:assert isinstance(reductionOp, str)
             #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
         #:else
@@ -176,47 +217,58 @@
         #:set reduction_val = ""
     #:endif
 
-    #:if copy is not None
-        #:assert isinstance(copy, list)
-        #:assert len(copy) != 0
-        #:assert all(type(element) == str for element in copy)
-        #:set copy_val = 'copy(' + ', '.join(copy) + ') '
-    #:else
-        #:set copy_val = ""
-    #:endif
+    #:set copy_val = GEN_COPY_STR(copy)
 
-    #:if copyin is not None
-        #:assert isinstance(copyin, list)
-        #:assert len(copyin) != 0
-        #:assert all(type(element) == str for element in copyin)
-        #:assert isinstance(copyinReadOnly, bool)
-        #:if copyinReadOnly == True
-            #:set copyin_val = 'copyin(readonly:' + ', '.join(copyin) + ') '
-        #:else
-            #:set copyin_val = 'copyin(' + ', '.join(copyin) + ') '
-        #:endif
-    #:else
-        #:set copyin_val = ""
-    #:endif
+    #:set copyin_val = GEN_COPYIN_STR(copyin, copyinReadOnly)
 
-    #:if copyout is not None
-        #:assert isinstance(copyout, list)
-        #:assert len(copyout) != 0
-        #:assert all(type(element) == str for element in copyout)
-        #:set copyout_val = 'copyout(' + ', '.join(copyout) + ') '
-    #:else
-        #:set copyout_val = ""
-    #:endif
+    #:set copyout_val = GEN_COPYOUT_STR(copyout)
+
+    #:set create_val = GEN_CREATE_STR(create)
 
-    #:if create is not None
-        #:assert isinstance(create, list)
-        #:assert len(create) != 0
-        #:assert all(type(element) == str for element in create)
-        #:set create_val = 'create(' + ', '.join(create) + ') '
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = collapse_val + parallelism_val + default_val + private_val + firstprivate_val + reduction_val + copy_val + copyin_val + copyout_val + create_val
+    #:set acc_directive = '!$acc parallel loop ' + clause_val + extraAccArgs_val
+    $:acc_directive
+#:enddef
+
+#:def routine(parallelism=["seq"], nohost=False, extraAccArgs=None)
+    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
+
+    #:assert isinstance(nohost, bool)
+    #:if nohost == True
+        #:set nohost_val = "nohost"
     #:else
-        #:set create_val = ""
+        #:set nohost_val = ""
     #:endif
 
-    #:set clause_val = collapse_val + parallelism_val + default_val + private_val + firstprivate_val + reduction_val + copy_val + copyin_val + copyout_val + create_val
-    !$acc parallel loop ${clause_val}$
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = parallelism_val + nohost_val
+    #:set acc_directive = '!$acc routine ' + clause_val + extraAccArgs_val
+    $:acc_directive
+#:enddef
+
+
+
+#:def declare(copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
+    #:set copy_val = GEN_COPY_STR(copy)
+
+    #:set copyin_val = GEN_COPYIN_STR(copyin, copyinReadOnly)
+
+    #:set copyout_val = GEN_COPYOUT_STR(copyout)
+
+    #:set create_val = GEN_CREATE_STR(create)
+
+    #:set present_val = GEN_PARENTHESES_CLAUSE("present", present)
+
+    #:set deviceptr_val = GEN_PARENTHESES_CLAUSE("deviceptr", deviceptr)
+
+    #:set link_val = GEN_PARENTHESES_CLAUSE("link", link)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = copy_val + copyin_val + copyout_val + create_val + present_val + deviceptr_val + link_val
+    #:set acc_directive = '!$acc declare ' + clause_val + extraAccArgs_val
+    $:acc_directive
 #:enddef
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 1432e53a7c..3b8732c593 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -83,7 +83,7 @@ contains
         if (bcxb >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 1, -1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (int(bc_type(1, -1)%sf(0, k, l)))
@@ -107,7 +107,7 @@ contains
         if (bcxe >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 1, 1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (int(bc_type(1, 1)%sf(0, k, l)))
@@ -135,7 +135,7 @@ contains
         if (bcyb >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 2, -1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(2, -1)%sf(k, 0, l)))
@@ -161,7 +161,7 @@ contains
         if (bcye >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 2, 1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(2, 1)%sf(k, 0, l)))
@@ -189,7 +189,7 @@ contains
         if (bczb >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 3, -1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(3, -1)%sf(k, l, 0)))
@@ -213,7 +213,7 @@ contains
         if (bcze >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 3, 1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(3, 1)%sf(k, l, 0)))
@@ -1167,7 +1167,7 @@ contains
         if (bcxb >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 1, -1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (bc_type(1, -1)%sf(0, k, l))
@@ -1185,7 +1185,7 @@ contains
         if (bcxe >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 1, 1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (bc_type(1, 1)%sf(0, k, l))
@@ -1206,7 +1206,7 @@ contains
         if (bcyb >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 2, -1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (bc_type(2, -1)%sf(k, 0, l))
@@ -1224,7 +1224,7 @@ contains
         if (bcye >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 2, 1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (bc_type(2, 1)%sf(k, 0, l))
@@ -1245,7 +1245,7 @@ contains
         if (bczb >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 3, -1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (bc_type(3, -1)%sf(k, l, 0))
@@ -1263,7 +1263,7 @@ contains
         if (bcze >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 3, 1)
         else
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (bc_type(3, 1)%sf(k, l, 0))
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index ad3e3c7187..416ecc63cc 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -99,7 +99,7 @@ contains
         real(wp), dimension(num_species) :: Ys
         real(wp), dimension(num_species) :: omega
 
-        $:parallel_loop(collapse=3, private=["Ys", "omega"])
+        $:PARALLEL_LOOP(collapse=3, private=["Ys", "omega"])
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 5522484167..48a2b748a0 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -97,7 +97,7 @@ contains
         !$acc declare create(p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok)
 
         ! starting equilibrium solver
-        $:parallel_loop(collapse=3, private=["p_infOV", "p_infpT", "p_infSL", &
+        $:PARALLEL_LOOP(collapse=3, private=["p_infOV", "p_infpT", "p_infSL", &
             "sk", "hk", "gk", "ek", "rhok", "pS", "pSOV", "pSSL", &
             "TS", "TSOV", "TSatOV", "TSatSL", "TSSL", "rhoe", &
             "dynE", "rhos", "rho", "rM", "m1", "m2", "MCT", "TvF"])
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 433616bc95..0f84d45222 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -883,7 +883,7 @@ contains
             end if
         #:endif
 
-        $:parallel_loop(collapse=3, private=["alpha_K", "alpha_rho_K", "Re_K", &
+        $:PARALLEL_LOOP(collapse=3, private=["alpha_K", "alpha_rho_K", "Re_K", &
             "nRtmp", "rho_K", "gamma_K", "pi_inf_K", "qv_K", &
         "dyn_pres_K", "rhoYks", "B"])
         do l = ibounds(3)%beg, ibounds(3)%end
@@ -1480,7 +1480,7 @@ contains
         ! Computing the flux variables from the primitive variables, without
         ! accounting for the contribution of either viscosity or capillarity
 #ifdef MFC_SIMULATION
-        $:parallel_loop(collapse=3, private=["alpha_rho_K", "vel_K", &
+        $:PARALLEL_LOOP(collapse=3, private=["alpha_rho_K", "vel_K", &
             "alpha_K", "Re_K", "Y_K"])
         do l = is3b, is3e
             do k = is2b, is2e
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 7b5d8a5fee..a1bc61efe8 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -159,7 +159,7 @@ contains
 
         sim_time = t_step*dt
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -312,7 +312,7 @@ contains
         end do
 
         ! Update the rhs variables
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index 0bbe1ec1b9..b0daf8dd50 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -79,7 +79,7 @@ contains
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         integer :: i, j, k, l !< standard iterators
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -109,7 +109,7 @@ contains
         call s_compute_acceleration(mytime)
         call s_compute_mixture_density(q_cons_vf)
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = momxb, E_idx
             do l = 0, p
                 do k = 0, n
@@ -122,7 +122,7 @@ contains
 
         if (bf_x) then ! x-direction body forces
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -137,7 +137,7 @@ contains
 
         if (bf_y) then ! y-direction body forces
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -152,7 +152,7 @@ contains
 
         if (bf_z) then ! z-direction body forces
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index 13371d9406..ca318f8cee 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -76,7 +76,7 @@ contains
         real(wp) :: nR3bar
         integer(wp) :: i, j, k, l
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -103,7 +103,7 @@ contains
         if (idir == 1) then
 
             if (.not. qbmm) then
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -119,7 +119,7 @@ contains
 
         elseif (idir == 2) then
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -133,7 +133,7 @@ contains
 
         elseif (idir == 3) then
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -173,7 +173,7 @@ contains
         integer :: dmBub_id !< Dummy variables for unified subgrid bubble subroutines
         real(wp) :: dmMass_v, dmMass_n, dmBeta_c, dmBeta_t, dmCson
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -191,7 +191,7 @@ contains
         end do
 
         adap_dt_stop_max = 0
-        $:parallel_loop(collapse=3, private=["Rtmp", "Vtmp", "myalpha_rho", "myalpha"], &
+        $:PARALLEL_LOOP(collapse=3, private=["Rtmp", "Vtmp", "myalpha_rho", "myalpha"], &
             reduction=["adap_dt_stop_max"], reductionOp="MAX", &
             copy=["adap_dt_stop_max"])
         do l = 0, p
@@ -325,7 +325,7 @@ contains
         if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.")
 
         if (.not. adap_dt) then
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do q = 0, n
                     do i = 0, m
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 8567ec97ae..5b6c818231 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -614,7 +614,7 @@ contains
         if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.")
 
         ! Bubbles remain in a fixed position
-        $:parallel_loop(collapse=2, private=["k"], copyin=["stage"])
+        $:PARALLEL_LOOP(collapse=2, private=["k"], copyin=["stage"])
         do k = 1, nBubs
             do l = 1, 3
                 mtn_dposdt(k, l, stage) = 0._wp
@@ -644,7 +644,7 @@ contains
         if (lag_params%solver_approach == 2) then
 
             if (p == 0) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -660,7 +660,7 @@ contains
                     end do
                 end do
             else
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -680,7 +680,7 @@ contains
 
                 call s_gradient_dir(q_prim_vf(E_idx), q_beta%vf(3), l)
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -695,7 +695,7 @@ contains
                 end do
 
                 !source in energy
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do k = idwbuff(3)%beg, idwbuff(3)%end
                     do j = idwbuff(2)%beg, idwbuff(2)%end
                         do i = idwbuff(1)%beg, idwbuff(1)%end
@@ -706,7 +706,7 @@ contains
 
                 call s_gradient_dir(q_beta%vf(3), q_beta%vf(4), l)
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -766,7 +766,7 @@ contains
 
         call nvtxStartRange("BUBBLES-LAGRANGE-KERNELS")
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = 1, q_beta_idx
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
@@ -781,7 +781,7 @@ contains
                               mtn_s, mtn_pos, q_beta)
 
         !Store 1-beta
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = idwbuff(3)%beg, idwbuff(3)%end
             do k = idwbuff(2)%beg, idwbuff(2)%end
                 do j = idwbuff(1)%beg, idwbuff(1)%end
@@ -1290,7 +1290,7 @@ contains
 
         if (dir == 1) then
             ! Gradient in x dir.
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do k = 0, p
                 do j = 0, n
                     do i = 0, m
@@ -1305,7 +1305,7 @@ contains
         else
             if (dir == 2) then
                 ! Gradient in y dir.
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -1319,7 +1319,7 @@ contains
                 end do
             else
                 ! Gradient in z dir.
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -1413,7 +1413,7 @@ contains
         lag_void_max = 0._wp
         lag_void_avg = 0._wp
         lag_vol = 0._wp
-        $:parallel_loop(collapse=3, reduction=[["lag_vol", "lag_void_avg"], &
+        $:PARALLEL_LOOP(collapse=3, reduction=[["lag_vol", "lag_void_avg"], &
             ["lag_void_max"]], reductionOp=["+", "MAX"], &
             copy=["lag_vol", "lag_void_avg", "lag_void_max"])
         do k = 0, p
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 67cdcc13fc..0235f71584 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -698,7 +698,7 @@ contains
                                                                F_src_rs${XYZ}$_vf, &
                                                                is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg)
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = 1, flux_cbc_index
                         do r = is3%beg, is3%end
                             do k = is2%beg, is2%end
@@ -710,7 +710,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = advxb, advxe
                         do r = is3%beg, is3%end
                             do k = is2%beg, is2%end
@@ -729,7 +729,7 @@ contains
                                                                F_src_rs${XYZ}$_vf, &
                                                                is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg)
 
-                    $:parallel_loop(collapse=4)
+                    $:PARALLEL_LOOP(collapse=4)
                     do i = 1, flux_cbc_index
                         do j = 0, 1
                             do r = is3%beg, is3%end
@@ -749,7 +749,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=4)
+                    $:PARALLEL_LOOP(collapse=4)
                     do i = advxb, advxe
                         do j = 0, 1
                             do r = is3%beg, is3%end
@@ -772,7 +772,7 @@ contains
                 end if
 
                 ! FD2 or FD4 of RHS at j = 0
-                $:parallel_loop(collapse=2, private=["alpha_rho", "vel", "adv", &
+                $:PARALLEL_LOOP(collapse=2, private=["alpha_rho", "vel", "adv", &
                     "mf", "dvel_ds", "dadv_ds", "Re_cbc", "dalpha_rho_ds","dvel_dt", &
                     "dadv_dt", "dalpha_rho_dt", "L", "lambda", "Ys", "dYs_dt", & 
                     "dYs_ds", "h_k", "Cp_i", "Gamma_i", "Xs"])
@@ -1167,7 +1167,7 @@ contains
         ! Reshaping Inputted Data in x-direction
         if (cbc_dir == 1) then
 
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1179,7 +1179,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = 0, buff_size
@@ -1190,7 +1190,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1203,7 +1203,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1214,7 +1214,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1226,7 +1226,7 @@ contains
                     end do
                 end do
             else
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1243,7 +1243,7 @@ contains
             ! Reshaping Inputted Data in y-direction
         elseif (cbc_dir == 2) then
 
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1255,7 +1255,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = 0, buff_size
@@ -1266,7 +1266,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1279,7 +1279,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1290,7 +1290,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1302,7 +1302,7 @@ contains
                     end do
                 end do
             else
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1319,7 +1319,7 @@ contains
             ! Reshaping Inputted Data in z-direction
         else
 
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1331,7 +1331,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = 0, buff_size
@@ -1342,7 +1342,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1355,7 +1355,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1366,7 +1366,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1378,7 +1378,7 @@ contains
                     end do
                 end do
             else
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1423,7 +1423,7 @@ contains
         ! Reshaping Outputted Data in x-direction
         if (cbc_dir == 1) then
 
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1435,7 +1435,7 @@ contains
                     end do
                 end do
             end do
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1446,7 +1446,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1458,7 +1458,7 @@ contains
                     end do
                 end do
             else
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1474,7 +1474,7 @@ contains
             ! Reshaping Outputted Data in y-direction
         elseif (cbc_dir == 2) then
 
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1487,7 +1487,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1498,7 +1498,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1510,7 +1510,7 @@ contains
                     end do
                 end do
             else
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1527,7 +1527,7 @@ contains
             ! Reshaping Outputted Data in z-direction
         else
 
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1540,7 +1540,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1551,7 +1551,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1563,7 +1563,7 @@ contains
                     end do
                 end do
             else
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index b778a0b4a2..3d87014ea1 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -277,7 +277,7 @@ contains
         integer :: j, k, l
 
         ! Computing Stability Criteria at Current Time-step
-        $:parallel_loop(collapse=3, private=["vel", "alpha", "Re"])
+        $:PARALLEL_LOOP(collapse=3, private=["vel", "alpha", "Re"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index e3476b4471..0a32961dce 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -139,7 +139,7 @@ contains
         if (bc_y%beg >= 0) return
 #if defined(MFC_OpenACC)
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 1, cmplx_size
@@ -148,7 +148,7 @@ contains
             end do
         end do
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 0, p
@@ -173,7 +173,7 @@ contains
         Nfq = 3
         !$acc update device(Nfq)
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 1, Nfq
@@ -191,7 +191,7 @@ contains
 #endif
         !$acc end host_data
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 0, p
@@ -203,7 +203,7 @@ contains
 
         do i = 1, fourier_rings
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
                     do l = 1, cmplx_size
@@ -212,7 +212,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3, firstprivate=["i"])
+            $:PARALLEL_LOOP(collapse=3, firstprivate=["i"])
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
@@ -233,7 +233,7 @@ contains
             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
             !$acc update device(Nfq)
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
                     do l = 1, Nfq
@@ -251,7 +251,7 @@ contains
 #endif
             !$acc end host_data
 
-            $:parallel_loop(collapse=3, firstprivate=["i"])
+            $:PARALLEL_LOOP(collapse=3, firstprivate=["i"])
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index ada0069746..e1a5d34a27 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -106,7 +106,7 @@ contains
         real(wp) :: G
         integer :: j, k, l, i, r
 
-        $:parallel_loop(collapse=3, private=["alpha_K", "alpha_rho_K", "rho", &
+        $:PARALLEL_LOOP(collapse=3, private=["alpha_K", "alpha_rho_K", "rho", &
             "gamma", "pi_inf", "qv", "G", "Re", "tensora", "tensorb"])
         do l = 0, p
             do k = 0, n
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index f785cbc355..b298833cf5 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -104,7 +104,7 @@ contains
             ! calculate velocity gradients + rho_K and G_K
             ! TODO: re-organize these loops one by one for GPU efficiency if possible?
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -114,7 +114,7 @@ contains
             end do
             !$acc end parallel loop
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -130,7 +130,7 @@ contains
             !$acc end parallel loop
 
             if (ndirs > 1) then
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do q = 0, p
                     do l = 0, n
                         do k = 0, m
@@ -140,7 +140,7 @@ contains
                 end do
                 !$acc end parallel loop
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do q = 0, p
                     do l = 0, n
                         do k = 0, m
@@ -161,7 +161,7 @@ contains
                 ! 3D
                 if (ndirs == 3) then
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do q = 0, p
                         do l = 0, n
                             do k = 0, m
@@ -172,7 +172,7 @@ contains
                     end do
                     !$acc end parallel loop
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do q = 0, p
                         do l = 0, n
                             do k = 0, m
@@ -196,7 +196,7 @@ contains
                 end if
             end if
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -220,7 +220,7 @@ contains
             end do
 
             ! apply rhs source term to elastic stress equation
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -234,7 +234,7 @@ contains
             end do
 
         elseif (idir == 2) then
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -269,7 +269,7 @@ contains
             end do
 
         elseif (idir == 3) then
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -337,7 +337,7 @@ contains
 
         if (cyl_coord .and. idir == 2) then
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -405,7 +405,7 @@ contains
             end do
         elseif (p == 0) then
             q = 0
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = 0, n
                 do k = 0, m
                     ! Maximum principal stress
@@ -419,7 +419,7 @@ contains
                 end do
             end do
         else
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index a9d89f8df8..9b87a5a3be 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -76,7 +76,7 @@ contains
         real(wp), dimension(3) :: v, B
         real(wp) :: divB, vdotB
 
-        $:parallel_loop(collapse=3, private=["v", "B"])
+        $:PARALLEL_LOOP(collapse=3, private=["v", "B"])
         do q = 0, p
             do l = 0, n
                 do k = 0, m
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 1b5e99985b..924a0ed758 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -480,7 +480,7 @@ contains
                 end do
             end if
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do q = 0, n
                     do i = 0, m
@@ -839,7 +839,7 @@ contains
 
         !$acc update device(is1_qbmm, is2_qbmm, is3_qbmm)
 
-        $:parallel_loop(collapse=3, private=["moms", "msum", "wght", "abscX", &
+        $:PARALLEL_LOOP(collapse=3, private=["moms", "msum", "wght", "abscX", &
             "abscY", "wght_pb", "wght_mv", "wght_ht", "coeff", "ht", "r", "q", &
             "n_tait", "B_tait", "pres", "rho", "nbub", "c", "alf", "momsum", &
             "drdt", "drdt2", "chi_vw", "x_vw", "rho_mw", "k_mw", "T_bar", "grad_T"])
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index c628067dfe..87c362aaea 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -592,7 +592,7 @@ contains
             !$acc update device(Res, Re_idx, Re_size)
         end if
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do id = 1, num_dims
             do i = 1, sys_size
                 do l = idwbuff(3)%beg, idwbuff(3)%end
@@ -632,7 +632,7 @@ contains
 
         call cpu_time(t_start)
         ! Association/Population of Working Variables
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
@@ -646,7 +646,7 @@ contains
         ! Converting Conservative to Primitive Variables
 
         if (mpp_lim .and. bubbles_euler) then
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
@@ -867,7 +867,7 @@ contains
         ! END: Dimensional Splitting Loop
 
         if (ib) then
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -935,7 +935,7 @@ contains
         ! END: Additional physics and source terms
 
         if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange) then
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do k = idwbuff(2)%beg, idwbuff(2)%end
@@ -975,7 +975,7 @@ contains
         real(wp) :: advected_qty_val, pressure_val, velocity_val
 
         if (alt_soundspeed) then
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do q_loop = 0, p
                 do l_loop = 0, n
                     do k_loop = 0, m
@@ -1231,7 +1231,7 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", "local_flux2"])
                             do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m
@@ -1245,7 +1245,7 @@ contains
                                                                                     local_inv_ds*local_term_coeff*(local_flux1 - local_flux2)
                                     end do; end do; end do
 
-                            $:parallel_loop(collapse=3, private=["local_inv_ds",&
+                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds",&
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1298,7 +1298,7 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1317,7 +1317,7 @@ contains
                                         end if
                                     end do; end do; end do
 
-                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1379,7 +1379,7 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1394,7 +1394,7 @@ contains
                                                                                     local_inv_ds*local_term_coeff*(local_flux1 - local_flux2)
                                     end do; end do; end do
 
-                            $:parallel_loop(collapse=3, private=["local_inv_ds", &
+                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1443,7 +1443,7 @@ contains
         if (idir == 1) then ! x-direction
 
             if (surface_tension) then
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1457,7 +1457,7 @@ contains
                 end do
             end if
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -1475,7 +1475,7 @@ contains
         elseif (idir == 2) then ! y-direction
 
             if (surface_tension) then
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1507,7 +1507,7 @@ contains
                                                              idwbuff(1), idwbuff(2), idwbuff(3))
                     end if
 
-                    $:parallel_loop(collapse=2)
+                    $:PARALLEL_LOOP(collapse=2)
                     do l = 0, p
                         do j = 0, m
                             !$acc loop seq
@@ -1522,7 +1522,7 @@ contains
 
                 end if
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 1, n
                         do j = 0, m
@@ -1538,7 +1538,7 @@ contains
                 end do
 
             else
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1559,7 +1559,7 @@ contains
             if (cyl_coord) then
                 if ((bc_y%beg == BC_REFLECTIVE) .or. (bc_y%beg == BC_AXIS)) then
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do l = 0, p
                         do k = 1, n
                             do j = 0, m
@@ -1575,7 +1575,7 @@ contains
                     end do
 
                     if (viscous) then
-                        $:parallel_loop(collapse=2)
+                        $:PARALLEL_LOOP(collapse=2)
                         do l = 0, p
                             do j = 0, m
                                 !$acc loop seq
@@ -1589,7 +1589,7 @@ contains
                     end if
                 else
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do l = 0, p
                         do k = 0, n
                             do j = 0, m
@@ -1610,7 +1610,7 @@ contains
         elseif (idir == 3) then ! z-direction
 
             if (surface_tension) then
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1624,7 +1624,7 @@ contains
                 end do
             end if
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -1640,7 +1640,7 @@ contains
             end do
 
             if (grid_geometry == 3) then
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1695,7 +1695,7 @@ contains
         integer :: i, j, k, l, q, iter !< Generic loop iterators
         integer :: relax !< Relaxation procedure determination variable
 
-        $:parallel_loop(collapse=3, private=["pres_K_init", "rho_K_s", &
+        $:PARALLEL_LOOP(collapse=3, private=["pres_K_init", "rho_K_s", &
             "alpha_rho", "alpha", "Re", "pres_relax"])
         do l = 0, p
             do k = 0, n
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 1aee1317d8..594e0009e3 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -362,7 +362,7 @@ contains
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
 
             if (norm_dir == ${NORM_DIR}$) then
-                $:parallel_loop(collapse=3, private=["alpha_rho_L", "alpha_rho_R", &
+                $:PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", "alpha_rho_R", &
                     "vel_L", "vel_R", "alpha_L", "alpha_R", "tau_e_L", "tau_e_R", &
                     "G_L", "G_R", "Re_L", "Re_R", "rho_avg", "h_avg", "gamma_avg", &
                     "s_L", "s_R", "s_S", "Ys_L", "Ys_R", "xi_field_L", "xi_field_R", &
@@ -1295,7 +1295,7 @@ contains
                 ! 6-EQUATION MODEL WITH HLLC
                 if (model_eqns == 3) then
                     !ME3
-                    $:parallel_loop(collapse=3, private=["vel_L", "vel_R", &
+                    $:PARALLEL_LOOP(collapse=3, private=["vel_L", "vel_R", &
                         "vel_K_Star", "Re_L", "Re_R", "rho_avg", "h_avg", &
                         "gamma_avg", "s_L", "s_R", "s_S", "vel_avg_rms", &
                         "alpha_L", "alpha_R", "Ys_L", "Ys_R", "Xs_L", "Xs_R", &
@@ -1728,7 +1728,7 @@ contains
 
                 elseif (model_eqns == 4) then
                     !ME4
-                    $:parallel_loop(collapse=3, private=["alpha_rho_L", &
+                    $:PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", &
                         "alpha_rho_R", "vel_L", "vel_R", "alpha_L", "alpha_R", &
                         "rho_avg", "h_avg", "gamma_avg", "s_L", "s_R", "s_S", &
                         "vel_avg_rms", "nbub_L", "nbub_R", "ptilde_L", "ptilde_R"])
@@ -1981,7 +1981,7 @@ contains
                     !$acc end parallel loop
 
                 elseif (model_eqns == 2 .and. bubbles_euler) then
-                    $:parallel_loop(collapse=3, private=["R0_L", "R0_R", "V0_L", &
+                    $:PARALLEL_LOOP(collapse=3, private=["R0_L", "R0_R", "V0_L", &
                         "V0_R", "P0_L", "P0_R", "pbw_L", "pbw_R", "vel_L", &
                         "vel_R", "rho_avg", "alpha_L", "alpha_R", "h_avg", &
                         "gamma_avg", "s_L", "s_R", "s_S", "nbub_L", "nbub_R", &
@@ -2452,7 +2452,7 @@ contains
                     !$acc end parallel loop
                 else
                     ! 5-EQUATION MODEL WITH HLLC
-                    $:parallel_loop(collapse=3, private=["vel_L", "vel_R", &
+                    $:PARALLEL_LOOP(collapse=3, private=["vel_L", "vel_R", &
                         "Re_L", "Re_R", "rho_avg", "h_avg", "gamma_avg", &
                         "alpha_L", "alpha_R", "s_L", "s_R", "s_S", &
                         "vel_avg_rms", "pcorr", "zcoef", "vel_L_tmp", &
@@ -3078,7 +3078,7 @@ contains
 
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
             if (norm_dir == ${NORM_DIR}$) then
-                $:parallel_loop(collapse=3, private=["alpha_rho_L", &
+                $:PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", &
                     "alpha_rho_R", "vel", "alpha_L", "alpha_R", "rho", "pres", &
                     "E", "H_no_mag", "gamma", "pi_inf", "qv", "vel_rms", "B", &
                     "c", "c_fast", "pres_mag", "U_L", "U_R", "U_starL", &
@@ -3516,7 +3516,7 @@ contains
         if (norm_dir == 1) then
 
             if (bc_x%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3527,7 +3527,7 @@ contains
                 end do
 
                 if (viscous) then
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do k = isy%beg, isy%end
@@ -3539,7 +3539,7 @@ contains
                     end do
 
                     if (n > 0) then
-                        $:parallel_loop(collapse=3)
+                        $:PARALLEL_LOOP(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do k = isy%beg, isy%end
@@ -3551,7 +3551,7 @@ contains
                         end do
 
                         if (p > 0) then
-                            $:parallel_loop(collapse=3)
+                            $:PARALLEL_LOOP(collapse=3)
                             do i = momxb, momxe
                                 do l = isz%beg, isz%end
                                     do k = isy%beg, isy%end
@@ -3571,7 +3571,7 @@ contains
 
             if (bc_x%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3583,7 +3583,7 @@ contains
 
                 if (viscous) then
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do k = isy%beg, isy%end
@@ -3595,7 +3595,7 @@ contains
                     end do
 
                     if (n > 0) then
-                        $:parallel_loop(collapse=3)
+                        $:PARALLEL_LOOP(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do k = isy%beg, isy%end
@@ -3607,7 +3607,7 @@ contains
                         end do
 
                         if (p > 0) then
-                            $:parallel_loop(collapse=3)
+                            $:PARALLEL_LOOP(collapse=3)
                             do i = momxb, momxe
                                 do l = isz%beg, isz%end
                                     do k = isy%beg, isy%end
@@ -3630,7 +3630,7 @@ contains
         elseif (norm_dir == 2) then
 
             if (bc_y%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3642,7 +3642,7 @@ contains
 
                 if (viscous) then
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3652,7 +3652,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3663,7 +3663,7 @@ contains
                     end do
 
                     if (p > 0) then
-                        $:parallel_loop(collapse=3)
+                        $:PARALLEL_LOOP(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do j = isx%beg, isx%end
@@ -3680,7 +3680,7 @@ contains
 
             if (bc_y%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3692,7 +3692,7 @@ contains
 
                 if (viscous) then
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3702,7 +3702,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3713,7 +3713,7 @@ contains
                     end do
 
                     if (p > 0) then
-                        $:parallel_loop(collapse=3)
+                        $:PARALLEL_LOOP(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do j = isx%beg, isx%end
@@ -3733,7 +3733,7 @@ contains
         else
 
             if (bc_z%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3744,7 +3744,7 @@ contains
                 end do
 
                 if (viscous) then
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3753,7 +3753,7 @@ contains
                             end do
                         end do
                     end do
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3762,7 +3762,7 @@ contains
                             end do
                         end do
                     end do
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3777,7 +3777,7 @@ contains
 
             if (bc_z%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3788,7 +3788,7 @@ contains
                 end do
 
                 if (viscous) then
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3798,7 +3798,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3808,7 +3808,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3864,7 +3864,7 @@ contains
 
             if (viscous .or. (surface_tension)) then
 
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = momxb, E_idx
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3878,7 +3878,7 @@ contains
 
             if (qbmm) then
 
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = 1, 4
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3894,7 +3894,7 @@ contains
         elseif (norm_dir == 2) then
 
             if (viscous .or. (surface_tension)) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = momxb, E_idx
                     do l = is3%beg, is3%end
                         do j = is1%beg, is1%end
@@ -3907,7 +3907,7 @@ contains
             end if
 
             if (qbmm) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = 1, 4
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3923,7 +3923,7 @@ contains
         else
 
             if (viscous .or. (surface_tension)) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = momxb, E_idx
                     do j = is1%beg, is1%end
                         do k = is2%beg, is2%end
@@ -3936,7 +3936,7 @@ contains
             end if
 
             if (qbmm) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = 1, 4
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -4002,7 +4002,7 @@ contains
         integer :: i_vel             !!< Loop iterator for velocity components.
         integer :: idx_rp(3)         !!< Indices $(j,k,l)$ of 'right' point for averaging.
 
-        $:parallel_loop(collapse=3, private=["idx_rp", "avg_v_int", &
+        $:PARALLEL_LOOP(collapse=3, private=["idx_rp", "avg_v_int", &
             "avg_dvdx_int", "avg_dvdy_int", "avg_dvdz_int", "Re_s", "Re_b", &
             "vel_src_int", "r_eff", "divergence_cyl", "stress_vector_shear", &
             "stress_normal_bulk", "div_v_term_const"])
@@ -4170,7 +4170,7 @@ contains
 
         real(wp) :: divergence_v   !< Velocity divergence at interface.
 
-        $:parallel_loop(collapse=3, private=["idx_right_phys", "vel_grad_avg", &
+        $:PARALLEL_LOOP(collapse=3, private=["idx_right_phys", "vel_grad_avg", &
             "current_tau_shear", "current_tau_bulk", "vel_src_at_interface", &
             "Re_shear", "Re_bulk", "divergence_v", "i_dim", "vel_comp_idx"])
         do l_loop = isz%beg, isz%end
@@ -4342,7 +4342,7 @@ contains
 
         ! Reshaping Outputted Data in y-direction
         if (norm_dir == 2) then
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do l = is3%beg, is3%end
                     do j = is1%beg, is1%end
@@ -4355,7 +4355,7 @@ contains
             end do
 
             if (cyl_coord) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do j = is1%beg, is1%end
@@ -4368,7 +4368,7 @@ contains
                 end do
             end if
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = is3%beg, is3%end
                 do j = is1%beg, is1%end
                     do k = is2%beg, is2%end
@@ -4379,7 +4379,7 @@ contains
             end do
 
             if (riemann_solver == 1 .or. riemann_solver == 4) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = advxb + 1, advxe
                     do l = is3%beg, is3%end
                         do j = is1%beg, is1%end
@@ -4394,7 +4394,7 @@ contains
             end if
             ! Reshaping Outputted Data in z-direction
         elseif (norm_dir == 3) then
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do j = is1%beg, is1%end
                     do k = is2%beg, is2%end
@@ -4407,7 +4407,7 @@ contains
                 end do
             end do
             if (grid_geometry == 3) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = 1, sys_size
                     do j = is1%beg, is1%end
                         do k = is2%beg, is2%end
@@ -4421,7 +4421,7 @@ contains
                 end do
             end if
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do j = is1%beg, is1%end
                 do k = is2%beg, is2%end
                     do l = is3%beg, is3%end
@@ -4432,7 +4432,7 @@ contains
             end do
 
             if (riemann_solver == 1 .or. riemann_solver == 4) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = advxb + 1, advxe
                     do j = is1%beg, is1%end
                         do k = is2%beg, is2%end
@@ -4446,7 +4446,7 @@ contains
 
             end if
         elseif (norm_dir == 1) then
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -4458,7 +4458,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = is1%beg, is1%end
@@ -4469,7 +4469,7 @@ contains
             end do
 
             if (riemann_solver == 1 .or. riemann_solver == 4) then
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = advxb + 1, advxe
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 549b409ac5..8b4d3d923f 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -86,7 +86,7 @@ contains
         integer :: j, k, l, i
 
         if (id == 1) then
-            $:parallel_loop(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
+            $:PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
                 "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", &
                 "normWR", "normW"])
             do l = isz%beg, isz%end
@@ -133,7 +133,7 @@ contains
 
         elseif (id == 2) then
 
-            $:parallel_loop(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
+            $:PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
                 "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", "normWR", &
                 "normW"])
             do l = isz%beg, isz%end
@@ -180,7 +180,7 @@ contains
 
         elseif (id == 3) then
 
-            $:parallel_loop(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
+            $:PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
                 "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", "normWR", &
                 "normW"])
             do l = isz%beg, isz%end
@@ -244,7 +244,7 @@ contains
         isx%end = m; isy%end = n; isz%end = p
 
         ! compute gradient components
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -254,7 +254,7 @@ contains
             end do
         end do
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -265,7 +265,7 @@ contains
         end do
 
         if (p > 0) then
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -276,7 +276,7 @@ contains
             end do
         end if
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 5bafcd96ed..7ee08b1223 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -377,7 +377,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -392,7 +392,7 @@ contains
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -409,7 +409,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -479,7 +479,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -494,7 +494,7 @@ contains
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -511,7 +511,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -551,7 +551,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -566,7 +566,7 @@ contains
         end do
 
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -584,7 +584,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -661,7 +661,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -676,7 +676,7 @@ contains
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -693,7 +693,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -733,7 +733,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -748,7 +748,7 @@ contains
         end do
 
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -766,7 +766,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -806,7 +806,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3)
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -821,7 +821,7 @@ contains
         end do
 
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -839,7 +839,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:parallel_loop(collapse=5)
+            $:PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -981,7 +981,7 @@ contains
             idwint, &
             gm_alpha_qp%vf)
 
-        $:parallel_loop(collapse=3, private=["vel", "alpha", "Re"])
+        $:PARALLEL_LOOP(collapse=3, private=["vel", "alpha", "Re"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -1024,7 +1024,7 @@ contains
         call nvtxStartRange("RHS-BODYFORCES")
         call s_compute_body_forces_rhs(q_prim_vf, q_cons_vf, rhs_vf)
 
-        $:parallel_loop(collapse=4)
+        $:PARALLEL_LOOP(collapse=4)
         do i = momxb, E_idx
             do l = 0, p
                 do k = 0, n
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index db40bbaaf2..244230c01f 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -77,7 +77,7 @@ contains
 
         !$acc update device(is1_viscous, is2_viscous, is3_viscous)
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
             do k = is2_viscous%beg, is2_viscous%end
                 do j = is1_viscous%beg, is1_viscous%end
@@ -89,7 +89,7 @@ contains
             end do
         end do
         if (shear_stress) then    ! Shear stresses
-            $:parallel_loop(collapse=3, private=["alpha_visc", &
+            $:PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
                 "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
@@ -197,7 +197,7 @@ contains
         end if
 
         if (bulk_stress) then    ! Bulk stresses
-            $:parallel_loop(collapse=3, private=["alpha_visc", &
+            $:PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
                 "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
@@ -302,7 +302,7 @@ contains
         if (p == 0) return
 
         if (shear_stress) then    ! Shear stresses
-            $:parallel_loop(collapse=3, private=["alpha_visc", &
+            $:PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
                 "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
@@ -411,7 +411,7 @@ contains
         end if
 
         if (bulk_stress) then    ! Bulk stresses
-            $:parallel_loop(collapse=3, private=["alpha_visc", &
+            $:PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
                 "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
@@ -593,7 +593,7 @@ contains
 
             !$acc update device(is1_viscous, is2_viscous, is3_viscous)
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = iy%beg, iy%end
                     do j = is1_viscous%beg + 1, is1_viscous%end
@@ -608,7 +608,7 @@ contains
                 end do
             end do
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end - 1
@@ -625,7 +625,7 @@ contains
 
             if (n > 0) then
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg, is1_viscous%end
@@ -640,7 +640,7 @@ contains
                     end do
                 end do
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg, is1_viscous%end
@@ -655,7 +655,7 @@ contains
                     end do
                 end do
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -674,7 +674,7 @@ contains
                     end do
                 end do
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -694,7 +694,7 @@ contains
                     end do
                 end do
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg + 1, is1_viscous%end
@@ -714,7 +714,7 @@ contains
                     end do
                 end do
 
-                $:parallel_loop(collapse=3)
+                $:PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg, is1_viscous%end - 1
@@ -736,7 +736,7 @@ contains
 
                 if (p > 0) then
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
@@ -752,7 +752,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
@@ -768,7 +768,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg + 1, is1_viscous%end
@@ -789,7 +789,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end - 1
@@ -810,7 +810,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg + 1, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
@@ -831,7 +831,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
@@ -852,7 +852,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
@@ -873,7 +873,7 @@ contains
                         end do
                     end do
 
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
@@ -893,7 +893,7 @@ contains
                             end do
                         end do
                     end do
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -913,7 +913,7 @@ contains
                             end do
                         end do
                     end do
-                    $:parallel_loop(collapse=3)
+                    $:PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -1023,7 +1023,7 @@ contains
         if (viscous) then
             if (weno_Re_flux) then
                 if (norm_dir == 2) then
-                    $:parallel_loop(collapse=4)
+                    $:PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end
@@ -1035,7 +1035,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 3) then
-                    $:parallel_loop(collapse=4)
+                    $:PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do j = is1_viscous%beg, is1_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1047,7 +1047,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 1) then
-                    $:parallel_loop(collapse=4)
+                    $:PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1122,7 +1122,7 @@ contains
         if (viscous) then
             if (weno_Re_flux) then
                 if (norm_dir == 2) then
-                    $:parallel_loop(collapse=4)
+                    $:PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end
@@ -1134,7 +1134,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 3) then
-                    $:parallel_loop(collapse=4)
+                    $:PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do j = is1_viscous%beg, is1_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1146,7 +1146,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 1) then
-                    $:parallel_loop(collapse=4)
+                    $:PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1210,7 +1210,7 @@ contains
             ! cell-boundaries, to calculate the cell-averaged first-order
             ! spatial derivatives inside the cell.
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg + 1, is1_viscous%end - 1
@@ -1238,7 +1238,7 @@ contains
             ! cell-boundaries, to calculate the cell-averaged first-order
             ! spatial derivatives inside the cell.
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg + 1, is2_viscous%end - 1
                     do j = is1_viscous%beg, is1_viscous%end
@@ -1266,7 +1266,7 @@ contains
             ! cell-boundaries, to calculate the cell-averaged first-order
             ! spatial derivatives inside the cell.
 
-            $:parallel_loop(collapse=3)
+            $:PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg + 1, is3_viscous%end - 1
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end
@@ -1321,7 +1321,7 @@ contains
 
         !$acc update device(is1_viscous, is2_viscous, is3_viscous)
 
-        $:parallel_loop(collapse=3)
+        $:PARALLEL_LOOP(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
             do k = is2_viscous%beg, is2_viscous%end
                 do j = is1_viscous%beg, is1_viscous%end
@@ -1358,7 +1358,7 @@ contains
             end do
         end if
 
-        $:parallel_loop(collapse=2)
+        $:PARALLEL_LOOP(collapse=2)
         do l = idwbuff(3)%beg, idwbuff(3)%end
             do k = idwbuff(2)%beg, idwbuff(2)%end
                 grad_x%sf(idwbuff(1)%beg, k, l) = &
@@ -1370,7 +1370,7 @@ contains
             end do
         end do
         if (n > 0) then
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do j = idwbuff(1)%beg, idwbuff(1)%end
                     grad_y%sf(j, idwbuff(2)%beg, l) = &
@@ -1382,7 +1382,7 @@ contains
                 end do
             end do
             if (p > 0) then
-                $:parallel_loop(collapse=2)
+                $:PARALLEL_LOOP(collapse=2)
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         grad_z%sf(j, k, idwbuff(3)%beg) = &
@@ -1397,7 +1397,7 @@ contains
         end if
 
         if (bc_x%beg <= BC_GHOST_EXTRAPOLATION) then
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     grad_x%sf(0, k, l) = (-3._wp*var%sf(0, k, l) + 4._wp*var%sf(1, k, l) - var%sf(2, k, l))/ &
@@ -1406,7 +1406,7 @@ contains
             end do
         end if
         if (bc_x%end <= BC_GHOST_EXTRAPOLATION) then
-            $:parallel_loop(collapse=2)
+            $:PARALLEL_LOOP(collapse=2)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     grad_x%sf(m, k, l) = (3._wp*var%sf(m, k, l) - 4._wp*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ &
@@ -1416,7 +1416,7 @@ contains
         end if
         if (n > 0) then
             if (bc_y%beg <= BC_GHOST_EXTRAPOLATION .and. bc_y%beg /= BC_NULL) then
-                $:parallel_loop(collapse=2)
+                $:PARALLEL_LOOP(collapse=2)
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         grad_y%sf(j, 0, l) = (-3._wp*var%sf(j, 0, l) + 4._wp*var%sf(j, 1, l) - var%sf(j, 2, l))/ &
@@ -1425,7 +1425,7 @@ contains
                 end do
             end if
             if (bc_y%end <= BC_GHOST_EXTRAPOLATION) then
-                $:parallel_loop(collapse=2)
+                $:PARALLEL_LOOP(collapse=2)
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         grad_y%sf(j, n, l) = (3._wp*var%sf(j, n, l) - 4._wp*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ &
@@ -1435,7 +1435,7 @@ contains
             end if
             if (p > 0) then
                 if (bc_z%beg <= BC_GHOST_EXTRAPOLATION) then
-                    $:parallel_loop(collapse=2)
+                    $:PARALLEL_LOOP(collapse=2)
                     do k = idwbuff(2)%beg, idwbuff(2)%end
                         do j = idwbuff(1)%beg, idwbuff(1)%end
                             grad_z%sf(j, k, 0) = &
@@ -1445,7 +1445,7 @@ contains
                     end do
                 end if
                 if (bc_z%end <= BC_GHOST_EXTRAPOLATION) then
-                    $:parallel_loop(collapse=2)
+                    $:PARALLEL_LOOP(collapse=2)
                     do k = idwbuff(2)%beg, idwbuff(2)%end
                         do j = idwbuff(1)%beg, idwbuff(1)%end
                             grad_z%sf(j, k, p) = &
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index e390192f6b..ad9551e096 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -1138,7 +1138,7 @@ contains
         !$acc update device(v_size)
 
         if (weno_dir == 1) then
-            $:parallel_loop(collapse=4)
+            $:PARALLEL_LOOP(collapse=4)
             do j = 1, v_size
                 do q = is3_weno%beg, is3_weno%end
                     do l = is2_weno%beg, is2_weno%end
@@ -1176,7 +1176,7 @@ contains
                 end if
             else
 #endif
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do j = 1, v_size
                     do q = is3_weno%beg, is3_weno%end
                         do l = is2_weno%beg, is2_weno%end
@@ -1206,7 +1206,7 @@ contains
                 end block
             else
 #endif
-                $:parallel_loop(collapse=4)
+                $:PARALLEL_LOOP(collapse=4)
                 do j = 1, v_size
                     do q = is3_weno%beg, is3_weno%end
                         do l = is2_weno%beg, is2_weno%end

From 5922cb2cb3c3548c39b1d77d8390b59f97526376 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 5 Jun 2025 19:32:15 -0400
Subject: [PATCH 11/75] Fixed issues with macro

---
 src/common/include/macros.fpp         | 76 ++++++++++++++++-----------
 src/common/m_finite_differences.fpp   |  4 +-
 src/common/m_variables_conversion.fpp |  2 +-
 src/simulation/m_cbc.fpp              |  2 +-
 4 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 180cf83c71..5e68d30b46 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -116,13 +116,13 @@
         $:ASSERT_LIST(clause_list, str)
         #:set clause_str = clause_name + '(' + ', '.join(clause_list) + ') '
     #:else
-        #:set clause_str = ""
+        #:set clause_str = ''
     #:endif
     $:clause_str
 #:enddef
 
 #:def GEN_COPY_STR(copy)
-    #:set copy_val = GEN_PARENTHESES_CLAUSE("copy", copy)
+    #:set copy_val = GEN_PARENTHESES_CLAUSE('copy', copy)
     $:copy_val
 #:enddef
 
@@ -136,18 +136,18 @@
             #:set copyin_val = 'copyin(' + ', '.join(copyin) + ') '
         #:endif
     #:else
-        #:set copyin_val = ""
+        #:set copyin_val = ''
     #:endif
     $:copyin_val
 #:enddef
 
 #:def GEN_COPYOUT_STR(copyout)
-    #:set copyout_val = GEN_PARENTHESES_CLAUSE("copyout", copyout)
+    #:set copyout_val = GEN_PARENTHESES_CLAUSE('copyout', copyout)
     $:copyout_val
 #:enddef
 
 #:def GEN_CREATE_STR(create)
-    #:set create_val = GEN_PARENTHESES_CLAUSE("create", create)
+    #:set create_val = GEN_PARENTHESES_CLAUSE('create', create)
     $:create_val
 #:enddef
 
@@ -156,7 +156,7 @@
         #:assert isinstance(extraArgs, str)
         #:set extraArgs_val = extraArgs
     #:else
-        #:set extraArgs_val = ""
+        #:set extraArgs_val = ''
     #:endif
     $:extraArgs_val
 #:enddef
@@ -164,36 +164,41 @@
 #:def GEN_PARALLELISM_STR(parallelism)
     #:if parallelism is not None
         $:ASSERT_LIST(parallelism, str)
-        #:assert all((element == "gang" or element == "worker" or element == "vector" or element == "seq") for element in parallelism)
-        #:set parallelism_val = " ".join(parallelism) + " "
+        #:assert all((element == 'gang' or element == 'worker' or &
+            & element == 'vector' or element == 'seq') for element in parallelism)
+        #:set parallelism_val = ' '.join(parallelism) + ' '
     #:else
-        #:set parallelism_val = ""
+        #:set parallelism_val = ''
     #:endif
+    #:set parallelism_val = parallelism_val.strip('\n')
     $:parallelism_val
 #:enddef
 
-#:def PARALLEL_LOOP(collapse=None, private=None, parallelism=["gang", "vector"], default="present", firstprivate=None, reduction=None, reductionOp=None, copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None, extraAccArgs=None)
+#:def PARALLEL_LOOP(collapse=None, private=None, parallelism=['gang', 'vector'], &
+    & default='present', firstprivate=None, reduction=None, reductionOp=None, &
+    & copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None, &
+    & extraAccArgs=None)
     #:if collapse is not None
         #:assert isinstance(collapse, int)
         #:assert collapse > 1
         #:set collapse_val = 'collapse(' + str(collapse) + ') '
     #:else
-        #:set collapse_val = ""
+        #:set collapse_val = ''
     #:endif
 
-    #:set private_val = GEN_PARENTHESES_CLAUSE("private", private)
+    #:set private_val = GEN_PARENTHESES_CLAUSE('private', private)
 
     #:if default is not None
         #:assert isinstance(default, str)
-        #:assert (default == "present" or default == "none")
+        #:assert (default == 'present' or default == 'none')
         #:set default_val = 'default(' + default + ') '
     #:else
-        #:set default_val = ""
+        #:set default_val = ''
     #:endif
 
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
 
-    #:set firstprivate_val = GEN_PARENTHESES_CLAUSE("firstprivate", firstprivate)
+    #:set firstprivate_val = GEN_PARENTHESES_CLAUSE('firstprivate', firstprivate)
 
     #:if reduction is not None and reductionOp is not None
         #:if isinstance(reduction, list) and isinstance(reductionOp, list)
@@ -203,18 +208,18 @@
             #:assert all(type(element) == str for sublist in reduction for element in sublist)
             #:assert len(reduction) == len(reductionOp)
             #:set reduction_list = ['reduction(' + op + ':' + ', '.join(red) + ') ' for (red, op) in zip(reduction, reductionOp)]
-            #:set reduction_val = " ". join(reduction_list) + " "
+            #:set reduction_val = ' '. join(reduction_list) + ' '
         #:elif isinstance(reduction, list) and isinstance(reductionOp, str)
             $:ASSERT_LIST(reduction, str)
             #:assert isinstance(reductionOp, str)
             #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
         #:else
-            #:stop "Invalid datatypes for reduction or reductionOp. Must be list of lists and lists or list and str respectively"
+            #:stop 'Invalid datatypes for reduction or reductionOp. Must be list of lists and lists or list and str respectively'
         #:endif
     #:elif reduction is not None or reductionOp is not None
-        #:stop "Cannot set the reduction list or reduction operation without setting the other"
+        #:stop 'Cannot set the reduction list or reduction operation without setting the other'
     #:else
-        #:set reduction_val = ""
+        #:set reduction_val = ''
     #:endif
 
     #:set copy_val = GEN_COPY_STR(copy)
@@ -227,25 +232,31 @@
 
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
 
-    #:set clause_val = collapse_val + parallelism_val + default_val + private_val + firstprivate_val + reduction_val + copy_val + copyin_val + copyout_val + create_val
-    #:set acc_directive = '!$acc parallel loop ' + clause_val + extraAccArgs_val
+    #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
+        & default_val.strip('\n') + private_val.strip('\n') + &
+        & firstprivate_val.strip('\n') + reduction_val.strip('\n') + &
+        & copy_val.strip('\n') + copyin_val.strip('\n') + &
+        & copyout_val.strip('\n') + create_val.strip('\n')
+    #:set acc_directive = '!$acc parallel loop ' + &
+        & clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
 #:enddef
 
-#:def routine(parallelism=["seq"], nohost=False, extraAccArgs=None)
+#:def routine(parallelism=['seq'], nohost=False, extraAccArgs=None)
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
 
     #:assert isinstance(nohost, bool)
     #:if nohost == True
-        #:set nohost_val = "nohost"
+        #:set nohost_val = 'nohost'
     #:else
-        #:set nohost_val = ""
+        #:set nohost_val = ''
     #:endif
 
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
 
-    #:set clause_val = parallelism_val + nohost_val
-    #:set acc_directive = '!$acc routine ' + clause_val + extraAccArgs_val
+    #:set clause_val = parallelism_val.strip('\n') + nohost_val.strip('\n')
+    #:set acc_directive = '!$acc routine ' + &
+        clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
 #:enddef
 
@@ -260,15 +271,18 @@
 
     #:set create_val = GEN_CREATE_STR(create)
 
-    #:set present_val = GEN_PARENTHESES_CLAUSE("present", present)
+    #:set present_val = GEN_PARENTHESES_CLAUSE('present', present)
 
-    #:set deviceptr_val = GEN_PARENTHESES_CLAUSE("deviceptr", deviceptr)
+    #:set deviceptr_val = GEN_PARENTHESES_CLAUSE('deviceptr', deviceptr)
 
-    #:set link_val = GEN_PARENTHESES_CLAUSE("link", link)
+    #:set link_val = GEN_PARENTHESES_CLAUSE('link', link)
 
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
 
-    #:set clause_val = copy_val + copyin_val + copyout_val + create_val + present_val + deviceptr_val + link_val
-    #:set acc_directive = '!$acc declare ' + clause_val + extraAccArgs_val
+    #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + &
+        & copyout_val.strip('\n') + create_val.strip('\n') + &
+        & present_val.strip('\n') + deviceptr_val.strip('\n') + &
+        & link_val.strip('\n')
+    #:set acc_directive = '!$acc declare ' + clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
 #:enddef
diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp
index d8f3c0f54a..ab3f331aad 100644
--- a/src/common/m_finite_differences.fpp
+++ b/src/common/m_finite_differences.fpp
@@ -1,3 +1,5 @@
+#:include 'macros.fpp'
+
 module m_finite_differences
 
     use m_global_parameters
@@ -16,7 +18,7 @@ contains
 
         real(wp) :: divergence
 
-        !$acc parallel loop collapse(3) private(divergence)
+        $:PARALLEL_LOOP(collapse=3, private=["divergence"])
         do x = ix_s%beg, ix_s%end
             do y = iy_s%beg, iy_s%end
                 do z = iz_s%beg, iz_s%end
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 0f84d45222..c6c662ce81 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -885,7 +885,7 @@ contains
 
         $:PARALLEL_LOOP(collapse=3, private=["alpha_K", "alpha_rho_K", "Re_K", &
             "nRtmp", "rho_K", "gamma_K", "pi_inf_K", "qv_K", &
-        "dyn_pres_K", "rhoYks", "B"])
+            "dyn_pres_K", "rhoYks", "B"])
         do l = ibounds(3)%beg, ibounds(3)%end
             do k = ibounds(2)%beg, ibounds(2)%end
                 do j = ibounds(1)%beg, ibounds(1)%end
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 0235f71584..ebeef9485d 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -774,7 +774,7 @@ contains
                 ! FD2 or FD4 of RHS at j = 0
                 $:PARALLEL_LOOP(collapse=2, private=["alpha_rho", "vel", "adv", &
                     "mf", "dvel_ds", "dadv_ds", "Re_cbc", "dalpha_rho_ds","dvel_dt", &
-                    "dadv_dt", "dalpha_rho_dt", "L", "lambda", "Ys", "dYs_dt", & 
+                    "dadv_dt", "dalpha_rho_dt", "L", "lambda", "Ys", "dYs_dt", &
                     "dYs_ds", "h_k", "Cp_i", "Gamma_i", "Xs"])
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end

From d59e15e2d8e39032e624f81c4719d092ee0c87b5 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Mon, 9 Jun 2025 14:19:56 -0400
Subject: [PATCH 12/75] Added loop macro

---
 src/common/include/macros.fpp | 62 ++++++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 5 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 5e68d30b46..a2b141c77f 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -242,7 +242,7 @@
     $:acc_directive
 #:enddef
 
-#:def routine(parallelism=['seq'], nohost=False, extraAccArgs=None)
+#:def ROUTINE(parallelism=['seq'], nohost=False, extraAccArgs=None)
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
 
     #:assert isinstance(nohost, bool)
@@ -256,13 +256,11 @@
 
     #:set clause_val = parallelism_val.strip('\n') + nohost_val.strip('\n')
     #:set acc_directive = '!$acc routine ' + &
-        clause_val + extraAccArgs_val.strip('\n')
+        & clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
 #:enddef
 
-
-
-#:def declare(copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
+#:def DECLARE(copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
     #:set copy_val = GEN_COPY_STR(copy)
 
     #:set copyin_val = GEN_COPYIN_STR(copyin, copyinReadOnly)
@@ -286,3 +284,57 @@
     #:set acc_directive = '!$acc declare ' + clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
 #:enddef
+
+#:def LOOP(collapse=None, parallelism=["seq"], data_dependency=None, reduction_list=None, reductionOp=None, private=None, extraAccArgs=None)
+    #:if collapse is not None
+        #:assert isinstance(collapse, int)
+        #:assert collapse > 1
+        #:set collapse_val = 'collapse(' + str(collapse) + ') '
+    #:else
+        #:set collapse_val = ''
+    #:endif
+
+    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
+
+
+    #:if data_dependency is not None
+        #:assert isinstance(data_dependency, str)
+        #:assert (data_dependency == 'auto' or data_dependency == 'independent')
+        #:set data_dependency_val = data_dependency 
+    #:else
+        #:set data_dependency_val = ''
+    #:endif
+    
+    #:set private_val = GEN_PARENTHESES_CLAUSE('private', private)
+
+    #:if reduction is not None and reductionOp is not None
+        #:if isinstance(reduction, list) and isinstance(reductionOp, list)
+            $:ASSERT_LIST(reduction, list)
+            $:ASSERT_LIST(reductionOp, str)
+            #:assert all(len(element) != 0 for element in reduction)
+            #:assert all(type(element) == str for sublist in reduction for element in sublist)
+            #:assert len(reduction) == len(reductionOp)
+            #:set reduction_list = ['reduction(' + op + ':' + ', '.join(red) + ') ' for (red, op) in zip(reduction, reductionOp)]
+            #:set reduction_val = ' '. join(reduction_list) + ' '
+        #:elif isinstance(reduction, list) and isinstance(reductionOp, str)
+            $:ASSERT_LIST(reduction, str)
+            #:assert isinstance(reductionOp, str)
+            #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
+        #:else
+            #:stop 'Invalid datatypes for reduction or reductionOp. Must be list of lists and lists or list and str respectively'
+        #:endif
+    #:elif reduction is not None or reductionOp is not None
+        #:stop 'Cannot set the reduction list or reduction operation without setting the other'
+    #:else
+        #:set reduction_val = ''
+    #:endif
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
+        & data_dependency_val.strip('\n') + private_val.strip('\n') + &
+        & reduction_val.strip('\n')
+    #:set acc_directive = '!$acc serial ' + &
+        & clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
\ No newline at end of file

From ef035a84ef605b0df97dbe7c1fc2caf34a806a64 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 10 Jun 2025 18:06:41 -0400
Subject: [PATCH 13/75] Added rest of directive macros

---
 src/common/include/macros.fpp | 271 ++++++++++++++++++++++++++--------
 1 file changed, 213 insertions(+), 58 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index a2b141c77f..d79960d87f 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -121,16 +121,26 @@
     $:clause_str
 #:enddef
 
+#:def GEN_PRIVATE_STR(private, initalized_values)
+    #:assert isinstance(initalized_values, bool)
+    #:if initalized_values == True
+        #:set private_val = GEN_PARENTHESES_CLAUSE('firstprivate', private)
+    #:else
+        #:set private_val = GEN_PARENTHESES_CLAUSE('private', private)
+    #:endif
+    $:private_val
+#:enddef
+
 #:def GEN_COPY_STR(copy)
     #:set copy_val = GEN_PARENTHESES_CLAUSE('copy', copy)
     $:copy_val
 #:enddef
 
-#:def GEN_COPYIN_STR(copyin, copyinReadOnly)
+#:def GEN_COPYIN_STR(copyin, readonly)
+    #:assert isinstance(readonly, bool)
     #:if copyin is not None
         $:ASSERT_LIST(copyin, str)
-        #:assert isinstance(copyinReadOnly, bool)
-        #:if copyinReadOnly == True
+        #:if readonly == True
             #:set copyin_val = 'copyin(readonly:' + ', '.join(copyin) + ') '
         #:else
             #:set copyin_val = 'copyin(' + ', '.join(copyin) + ') '
@@ -151,6 +161,41 @@
     $:create_val
 #:enddef
 
+#:def GEN_NOCREATE_STR(no_create)
+    #:set nocreate_val = GEN_PARENTHESES_CLAUSE('no_create', no_create)
+    $:nocreate_val
+#:enddef
+
+#:def GEN_DELETE_STR(delete)
+    #:set delete_val = GEN_PARENTHESES_CLAUSE('delete', delete)
+    $:delete_val
+#:enddef
+
+#:def GEN_PRESENT_STR(present)
+    #:set present_val = GEN_PARENTHESES_CLAUSE('present', present)
+    $:present_val
+#:enddef
+
+#:def GEN_DEVICEPTR_STR(deviceptr)
+    #:set deviceptr_val = GEN_PARENTHESES_CLAUSE('deviceptr', deviceptr)
+    $:deviceptr_val
+#:enddef
+
+#:def GEN_ATTACH_STR(attach)
+    #:set attach_val = GEN_PARENTHESES_CLAUSE('attach', attach)
+    $:attach_val
+#:enddef
+
+#:def GEN_DETACH_STR(detach)
+    #:set detach_val = GEN_PARENTHESES_CLAUSE('detach', detach)
+    $:detach_val
+#:enddef
+
+#:def GEN_LINK_STR(link)
+    #:set link_val = GEN_PARENTHESES_CLAUSE('link', link)
+    $:link_val
+#:enddef
+
 #:def GEN_EXTRA_ARGS_STR(extraArgs)
     #:if extraArgs is not None
         #:assert isinstance(extraArgs, str)
@@ -170,14 +215,10 @@
     #:else
         #:set parallelism_val = ''
     #:endif
-    #:set parallelism_val = parallelism_val.strip('\n')
     $:parallelism_val
 #:enddef
 
-#:def PARALLEL_LOOP(collapse=None, private=None, parallelism=['gang', 'vector'], &
-    & default='present', firstprivate=None, reduction=None, reductionOp=None, &
-    & copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None, &
-    & extraAccArgs=None)
+#:def GEN_COLLAPSE_STR(collapse)
     #:if collapse is not None
         #:assert isinstance(collapse, int)
         #:assert collapse > 1
@@ -185,9 +226,10 @@
     #:else
         #:set collapse_val = ''
     #:endif
+    $:collapse_val
+#:enddef
 
-    #:set private_val = GEN_PARENTHESES_CLAUSE('private', private)
-
+#:def GEN_DEFAULT_STR(default)
     #:if default is not None
         #:assert isinstance(default, str)
         #:assert (default == 'present' or default == 'none')
@@ -195,17 +237,16 @@
     #:else
         #:set default_val = ''
     #:endif
+    $:default_val
+#:enddef
 
-    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
-
-    #:set firstprivate_val = GEN_PARENTHESES_CLAUSE('firstprivate', firstprivate)
-
+#:def GEN_REDUCTION_STR(reduction, reductionOp)
     #:if reduction is not None and reductionOp is not None
         #:if isinstance(reduction, list) and isinstance(reductionOp, list)
             $:ASSERT_LIST(reduction, list)
             $:ASSERT_LIST(reductionOp, str)
             #:assert all(len(element) != 0 for element in reduction)
-            #:assert all(type(element) == str for sublist in reduction for element in sublist)
+            #:assert all(isinstance(element, str) for sublist in reduction for element in sublist)
             #:assert len(reduction) == len(reductionOp)
             #:set reduction_list = ['reduction(' + op + ':' + ', '.join(red) + ') ' for (red, op) in zip(reduction, reductionOp)]
             #:set reduction_val = ' '. join(reduction_list) + ' '
@@ -221,22 +262,59 @@
     #:else
         #:set reduction_val = ''
     #:endif
+    $:reduction_val
+#:enddef
 
-    #:set copy_val = GEN_COPY_STR(copy)
+#:def GEN_HOST_STR(host)
+    #:set host_val = GEN_PARENTHESES_CLAUSE('host', host)
+    $:host_val
+#:enddef
+
+#:def GEN_DEVICE_STR(device)
+    #:set device_val = GEN_PARENTHESES_CLAUSE('device', device)
+    $:device_val
+#:enddef
+
+
+#:def PARALLEL_LOOP(collapse=None, private=None, parallelism=['gang', 'vector'], &
+    & default='present', firstprivate=None, reduction=None, reductionOp=None, &
+    & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, &
+    & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None)
+
+    #:set collapse_val = GEN_COLLAPSE_STR(collapse)
+    
+    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
+
+    #:set default_val = GEN_DEFAULT_STR(default)
 
-    #:set copyin_val = GEN_COPYIN_STR(copyin, copyinReadOnly)
+    #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n')
 
+    #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
+
+    #:set copy_val = GEN_COPY_STR(copy)
+
+    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
+    
     #:set copyout_val = GEN_COPYOUT_STR(copyout)
 
     #:set create_val = GEN_CREATE_STR(create)
 
+    #:set no_create_val = GEN_NOCREATE_STR(no_create)
+
+    #:set present_val = GEN_PRESENT_STR(present)
+
+    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
+
+    #:set attach_val = GEN_ATTACH_STR(attach)
+
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
 
     #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
-        & default_val.strip('\n') + private_val.strip('\n') + &
-        & firstprivate_val.strip('\n') + reduction_val.strip('\n') + &
+        & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + &
         & copy_val.strip('\n') + copyin_val.strip('\n') + &
-        & copyout_val.strip('\n') + create_val.strip('\n')
+        & copyout_val.strip('\n') + create_val.strip('\n') + &
+        & no_create_val.strip('\n') + present_val.strip('\n') + &
+        & deviceptr_val.strip('\n') + attach_val.strip('\n')
     #:set acc_directive = '!$acc parallel loop ' + &
         & clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
@@ -260,20 +338,21 @@
     $:acc_directive
 #:enddef
 
-#:def DECLARE(copy=None, copyin=None, copyinReadOnly=False, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
+#:def DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
+    
     #:set copy_val = GEN_COPY_STR(copy)
 
-    #:set copyin_val = GEN_COPYIN_STR(copyin, copyinReadOnly)
+    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
 
     #:set copyout_val = GEN_COPYOUT_STR(copyout)
 
     #:set create_val = GEN_CREATE_STR(create)
 
-    #:set present_val = GEN_PARENTHESES_CLAUSE('present', present)
+    #:set present_val = GEN_PRESENT_STR(present)
 
-    #:set deviceptr_val = GEN_PARENTHESES_CLAUSE('deviceptr', deviceptr)
+    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
 
-    #:set link_val = GEN_PARENTHESES_CLAUSE('link', link)
+    #:set link_val = GEN_LINK_STR(link)
 
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
 
@@ -285,17 +364,11 @@
     $:acc_directive
 #:enddef
 
-#:def LOOP(collapse=None, parallelism=["seq"], data_dependency=None, reduction_list=None, reductionOp=None, private=None, extraAccArgs=None)
-    #:if collapse is not None
-        #:assert isinstance(collapse, int)
-        #:assert collapse > 1
-        #:set collapse_val = 'collapse(' + str(collapse) + ') '
-    #:else
-        #:set collapse_val = ''
-    #:endif
+#:def LOOP(collapse=None, parallelism=["seq"], data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
 
-    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
+    #:set collapse_val = GEN_COLLAPSE_STR(collapse)
 
+    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
 
     #:if data_dependency is not None
         #:assert isinstance(data_dependency, str)
@@ -305,36 +378,118 @@
         #:set data_dependency_val = ''
     #:endif
     
-    #:set private_val = GEN_PARENTHESES_CLAUSE('private', private)
+    #:set private_val = GEN_PRIVATE_STR(private, False)
 
-    #:if reduction is not None and reductionOp is not None
-        #:if isinstance(reduction, list) and isinstance(reductionOp, list)
-            $:ASSERT_LIST(reduction, list)
-            $:ASSERT_LIST(reductionOp, str)
-            #:assert all(len(element) != 0 for element in reduction)
-            #:assert all(type(element) == str for sublist in reduction for element in sublist)
-            #:assert len(reduction) == len(reductionOp)
-            #:set reduction_list = ['reduction(' + op + ':' + ', '.join(red) + ') ' for (red, op) in zip(reduction, reductionOp)]
-            #:set reduction_val = ' '. join(reduction_list) + ' '
-        #:elif isinstance(reduction, list) and isinstance(reductionOp, str)
-            $:ASSERT_LIST(reduction, str)
-            #:assert isinstance(reductionOp, str)
-            #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
-        #:else
-            #:stop 'Invalid datatypes for reduction or reductionOp. Must be list of lists and lists or list and str respectively'
-        #:endif
-    #:elif reduction is not None or reductionOp is not None
-        #:stop 'Cannot set the reduction list or reduction operation without setting the other'
-    #:else
-        #:set reduction_val = ''
-    #:endif
+    #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
 
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
 
     #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
         & data_dependency_val.strip('\n') + private_val.strip('\n') + &
         & reduction_val.strip('\n')
-    #:set acc_directive = '!$acc serial ' + &
+    #:set acc_directive = '!$acc loop ' + &
         & clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
-#:enddef
\ No newline at end of file
+#:enddef
+
+#:def DATA(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None)
+    
+    #:set copy_val = GEN_COPY_STR(copy)
+
+    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
+
+    #:set copyout_val = GEN_COPYOUT_STR(copyout)
+
+    #:set create_val = GEN_CREATE_STR(create)
+
+    #:set no_create_val = GEN_NOCREATE_STR(no_create)
+
+    #:set present_val = GEN_PRESENT_STR(present)
+
+    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
+
+    #:set attach_val = GEN_ATTACH_STR(attach)
+
+    #:set default_val = GEN_DEFAULT_STR(default)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + &
+        & copyout_val.strip('\n') + create_val.strip('\n') + &
+        & no_create_val.strip('\n') + present_val.strip('\n') + & 
+        & deviceptr_val.strip('\n') + attach_val.strip('\n') + &
+        & default_val.strip('\n')
+    #:set acc_directive = '!$acc data ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None)
+    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
+
+    #:set create_val = GEN_CREATE_STR(create)
+
+    #:set attach_val = GEN_ATTACH_STR(attach)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = copyin_val.strip('\n') + create_val.strip('\n') + attach_val.strip('\n')
+    #:set acc_directive = '!$acc enter data ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None)
+    #:set copyout_val = GEN_COPYOUT_STR(copyout)
+
+    #:set delete_val = GEN_DELETE_STR(delete)
+
+    #:set detach_val = GEN_DETACH_STR(detach)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = copyout_val.strip('\n') + delete_val.strip('\n') + detach_val.strip('\n')
+    #:set acc_directive = '!$acc exit data ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def CACHE(cache, extraAccArgs=None)
+    #:set cache_val = GEN_PARENTHESES_CLAUSE('cache', cache)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = cache_val.strip('\n')
+    #:set acc_directive = '!$acc ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def ATOMIC(atomic='update', extraAccArgs=None)
+    #:assert isinstance(atomic, str)
+    #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture')
+
+    #:set atomic_val = atomic
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = atomic_val.strip('\n')
+    #:set acc_directive = '!$acc atomic ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def UPDATE(host=None, device=None, extraAccArgs=None)
+    #:set host_val = GEN_HOST_STR(host)
+
+    #:set device_val = GEN_DEVICE_STR(device)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = host_val.strip('\n') + device_val.strip('\n')
+    #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def WAIT(host=None, device=None, extraAccArgs=None)
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = host_val.strip('\n') + device_val.strip('\n')
+    #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef

From c4332d813f377f1533a5611cd75ce707446a65d6 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 10 Jun 2025 19:56:31 -0400
Subject: [PATCH 14/75] Moved directive macros to seperate file

---
 src/common/include/directive_macros.fpp | 391 +++++++++++++++++++++++
 src/common/include/macros.fpp           | 392 +-----------------------
 src/common/m_boundary_common.fpp        |   1 +
 src/common/m_chemistry.fpp              |   1 +
 src/common/m_finite_differences.fpp     |   1 +
 src/common/m_phase_change.fpp           |   1 +
 src/common/m_variables_conversion.fpp   |   1 +
 src/simulation/m_acoustic_src.fpp       |   1 +
 src/simulation/m_body_forces.fpp        |   1 +
 src/simulation/m_bubbles_EE.fpp         |   1 +
 src/simulation/m_bubbles_EL.fpp         |   1 +
 src/simulation/m_cbc.fpp                |   1 +
 src/simulation/m_data_output.fpp        |   1 +
 src/simulation/m_fftw.fpp               |   1 +
 src/simulation/m_hyperelastic.fpp       |   1 +
 src/simulation/m_hypoelastic.fpp        |   1 +
 src/simulation/m_mhd.fpp                |   1 +
 src/simulation/m_qbmm.fpp               |   1 +
 src/simulation/m_rhs.fpp                |   1 +
 src/simulation/m_riemann_solvers.fpp    |   1 +
 src/simulation/m_surface_tension.fpp    |   1 +
 src/simulation/m_time_steppers.fpp      |   1 +
 src/simulation/m_viscous.fpp            |   1 +
 src/simulation/m_weno.fpp               |   1 +
 24 files changed, 414 insertions(+), 391 deletions(-)
 create mode 100644 src/common/include/directive_macros.fpp

diff --git a/src/common/include/directive_macros.fpp b/src/common/include/directive_macros.fpp
new file mode 100644
index 0000000000..2760fd1e9e
--- /dev/null
+++ b/src/common/include/directive_macros.fpp
@@ -0,0 +1,391 @@
+#:def ASSERT_LIST(data, datatype)
+    #:assert data is not None
+    #:assert isinstance(data, list)
+    #:assert len(data) != 0
+    #:assert all(isinstance(element, datatype) for element in data)
+#:enddef
+
+#:def GEN_PARENTHESES_CLAUSE(clause_name, clause_list)
+    #:assert isinstance(clause_name, str)
+    #:if clause_list is not None
+        $:ASSERT_LIST(clause_list, str)
+        #:set clause_str = clause_name + '(' + ', '.join(clause_list) + ') '
+    #:else
+        #:set clause_str = ''
+    #:endif
+    $:clause_str
+#:enddef
+
+#:def GEN_PRIVATE_STR(private, initalized_values)
+    #:assert isinstance(initalized_values, bool)
+    #:if initalized_values == True
+        #:set private_val = GEN_PARENTHESES_CLAUSE('firstprivate', private)
+    #:else
+        #:set private_val = GEN_PARENTHESES_CLAUSE('private', private)
+    #:endif
+    $:private_val
+#:enddef
+
+#:def GEN_COPY_STR(copy)
+    #:set copy_val = GEN_PARENTHESES_CLAUSE('copy', copy)
+    $:copy_val
+#:enddef
+
+#:def GEN_COPYIN_STR(copyin, readonly)
+    #:assert isinstance(readonly, bool)
+    #:if copyin is not None
+        $:ASSERT_LIST(copyin, str)
+        #:if readonly == True
+            #:set copyin_val = 'copyin(readonly:' + ', '.join(copyin) + ') '
+        #:else
+            #:set copyin_val = 'copyin(' + ', '.join(copyin) + ') '
+        #:endif
+    #:else
+        #:set copyin_val = ''
+    #:endif
+    $:copyin_val
+#:enddef
+
+#:def GEN_COPYOUT_STR(copyout)
+    #:set copyout_val = GEN_PARENTHESES_CLAUSE('copyout', copyout)
+    $:copyout_val
+#:enddef
+
+#:def GEN_CREATE_STR(create)
+    #:set create_val = GEN_PARENTHESES_CLAUSE('create', create)
+    $:create_val
+#:enddef
+
+#:def GEN_NOCREATE_STR(no_create)
+    #:set nocreate_val = GEN_PARENTHESES_CLAUSE('no_create', no_create)
+    $:nocreate_val
+#:enddef
+
+#:def GEN_DELETE_STR(delete)
+    #:set delete_val = GEN_PARENTHESES_CLAUSE('delete', delete)
+    $:delete_val
+#:enddef
+
+#:def GEN_PRESENT_STR(present)
+    #:set present_val = GEN_PARENTHESES_CLAUSE('present', present)
+    $:present_val
+#:enddef
+
+#:def GEN_DEVICEPTR_STR(deviceptr)
+    #:set deviceptr_val = GEN_PARENTHESES_CLAUSE('deviceptr', deviceptr)
+    $:deviceptr_val
+#:enddef
+
+#:def GEN_ATTACH_STR(attach)
+    #:set attach_val = GEN_PARENTHESES_CLAUSE('attach', attach)
+    $:attach_val
+#:enddef
+
+#:def GEN_DETACH_STR(detach)
+    #:set detach_val = GEN_PARENTHESES_CLAUSE('detach', detach)
+    $:detach_val
+#:enddef
+
+#:def GEN_LINK_STR(link)
+    #:set link_val = GEN_PARENTHESES_CLAUSE('link', link)
+    $:link_val
+#:enddef
+
+#:def GEN_EXTRA_ARGS_STR(extraArgs)
+    #:if extraArgs is not None
+        #:assert isinstance(extraArgs, str)
+        #:set extraArgs_val = extraArgs
+    #:else
+        #:set extraArgs_val = ''
+    #:endif
+    $:extraArgs_val
+#:enddef
+
+#:def GEN_PARALLELISM_STR(parallelism)
+    #:if parallelism is not None
+        $:ASSERT_LIST(parallelism, str)
+        #:assert all((element == 'gang' or element == 'worker' or &
+            & element == 'vector' or element == 'seq') for element in parallelism)
+        #:set parallelism_val = ' '.join(parallelism) + ' '
+    #:else
+        #:set parallelism_val = ''
+    #:endif
+    $:parallelism_val
+#:enddef
+
+#:def GEN_COLLAPSE_STR(collapse)
+    #:if collapse is not None
+        #:assert isinstance(collapse, int)
+        #:assert collapse > 1
+        #:set collapse_val = 'collapse(' + str(collapse) + ') '
+    #:else
+        #:set collapse_val = ''
+    #:endif
+    $:collapse_val
+#:enddef
+
+#:def GEN_DEFAULT_STR(default)
+    #:if default is not None
+        #:assert isinstance(default, str)
+        #:assert (default == 'present' or default == 'none')
+        #:set default_val = 'default(' + default + ') '
+    #:else
+        #:set default_val = ''
+    #:endif
+    $:default_val
+#:enddef
+
+#:def GEN_REDUCTION_STR(reduction, reductionOp)
+    #:if reduction is not None and reductionOp is not None
+        #:if isinstance(reduction, list) and isinstance(reductionOp, list)
+            $:ASSERT_LIST(reduction, list)
+            $:ASSERT_LIST(reductionOp, str)
+            #:assert all(len(element) != 0 for element in reduction)
+            #:assert all(isinstance(element, str) for sublist in reduction for element in sublist)
+            #:assert len(reduction) == len(reductionOp)
+            #:set reduction_list = ['reduction(' + op + ':' + ', '.join(red) + ') ' for (red, op) in zip(reduction, reductionOp)]
+            #:set reduction_val = ' '. join(reduction_list) + ' '
+        #:elif isinstance(reduction, list) and isinstance(reductionOp, str)
+            $:ASSERT_LIST(reduction, str)
+            #:assert isinstance(reductionOp, str)
+            #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
+        #:else
+            #:stop 'Invalid datatypes for reduction or reductionOp. Must be list of lists and lists or list and str respectively'
+        #:endif
+    #:elif reduction is not None or reductionOp is not None
+        #:stop 'Cannot set the reduction list or reduction operation without setting the other'
+    #:else
+        #:set reduction_val = ''
+    #:endif
+    $:reduction_val
+#:enddef
+
+#:def GEN_HOST_STR(host)
+    #:set host_val = GEN_PARENTHESES_CLAUSE('host', host)
+    $:host_val
+#:enddef
+
+#:def GEN_DEVICE_STR(device)
+    #:set device_val = GEN_PARENTHESES_CLAUSE('device', device)
+    $:device_val
+#:enddef
+
+
+#:def PARALLEL_LOOP(collapse=None, private=None, parallelism=['gang', 'vector'], &
+    & default='present', firstprivate=None, reduction=None, reductionOp=None, &
+    & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, &
+    & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None)
+
+    #:set collapse_val = GEN_COLLAPSE_STR(collapse)
+    
+    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
+
+    #:set default_val = GEN_DEFAULT_STR(default)
+
+    #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n')
+
+    #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
+
+    #:set copy_val = GEN_COPY_STR(copy)
+
+    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
+    
+    #:set copyout_val = GEN_COPYOUT_STR(copyout)
+
+    #:set create_val = GEN_CREATE_STR(create)
+
+    #:set no_create_val = GEN_NOCREATE_STR(no_create)
+
+    #:set present_val = GEN_PRESENT_STR(present)
+
+    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
+
+    #:set attach_val = GEN_ATTACH_STR(attach)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
+        & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + &
+        & copy_val.strip('\n') + copyin_val.strip('\n') + &
+        & copyout_val.strip('\n') + create_val.strip('\n') + &
+        & no_create_val.strip('\n') + present_val.strip('\n') + &
+        & deviceptr_val.strip('\n') + attach_val.strip('\n')
+    #:set acc_directive = '!$acc parallel loop ' + &
+        & clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def ROUTINE(parallelism=['seq'], nohost=False, extraAccArgs=None)
+    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
+
+    #:assert isinstance(nohost, bool)
+    #:if nohost == True
+        #:set nohost_val = 'nohost'
+    #:else
+        #:set nohost_val = ''
+    #:endif
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = parallelism_val.strip('\n') + nohost_val.strip('\n')
+    #:set acc_directive = '!$acc routine ' + &
+        & clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
+    
+    #:set copy_val = GEN_COPY_STR(copy)
+
+    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
+
+    #:set copyout_val = GEN_COPYOUT_STR(copyout)
+
+    #:set create_val = GEN_CREATE_STR(create)
+
+    #:set present_val = GEN_PRESENT_STR(present)
+
+    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
+
+    #:set link_val = GEN_LINK_STR(link)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + &
+        & copyout_val.strip('\n') + create_val.strip('\n') + &
+        & present_val.strip('\n') + deviceptr_val.strip('\n') + &
+        & link_val.strip('\n')
+    #:set acc_directive = '!$acc declare ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def LOOP(collapse=None, parallelism=["seq"], data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
+
+    #:set collapse_val = GEN_COLLAPSE_STR(collapse)
+
+    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
+
+    #:if data_dependency is not None
+        #:assert isinstance(data_dependency, str)
+        #:assert (data_dependency == 'auto' or data_dependency == 'independent')
+        #:set data_dependency_val = data_dependency 
+    #:else
+        #:set data_dependency_val = ''
+    #:endif
+    
+    #:set private_val = GEN_PRIVATE_STR(private, False)
+
+    #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
+        & data_dependency_val.strip('\n') + private_val.strip('\n') + &
+        & reduction_val.strip('\n')
+    #:set acc_directive = '!$acc loop ' + &
+        & clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def DATA(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None)
+    
+    #:set copy_val = GEN_COPY_STR(copy)
+
+    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
+
+    #:set copyout_val = GEN_COPYOUT_STR(copyout)
+
+    #:set create_val = GEN_CREATE_STR(create)
+
+    #:set no_create_val = GEN_NOCREATE_STR(no_create)
+
+    #:set present_val = GEN_PRESENT_STR(present)
+
+    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
+
+    #:set attach_val = GEN_ATTACH_STR(attach)
+
+    #:set default_val = GEN_DEFAULT_STR(default)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + &
+        & copyout_val.strip('\n') + create_val.strip('\n') + &
+        & no_create_val.strip('\n') + present_val.strip('\n') + & 
+        & deviceptr_val.strip('\n') + attach_val.strip('\n') + &
+        & default_val.strip('\n')
+    #:set acc_directive = '!$acc data ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None)
+    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
+
+    #:set create_val = GEN_CREATE_STR(create)
+
+    #:set attach_val = GEN_ATTACH_STR(attach)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = copyin_val.strip('\n') + create_val.strip('\n') + attach_val.strip('\n')
+    #:set acc_directive = '!$acc enter data ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None)
+    #:set copyout_val = GEN_COPYOUT_STR(copyout)
+
+    #:set delete_val = GEN_DELETE_STR(delete)
+
+    #:set detach_val = GEN_DETACH_STR(detach)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = copyout_val.strip('\n') + delete_val.strip('\n') + detach_val.strip('\n')
+    #:set acc_directive = '!$acc exit data ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def CACHE(cache, extraAccArgs=None)
+    #:set cache_val = GEN_PARENTHESES_CLAUSE('cache', cache)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = cache_val.strip('\n')
+    #:set acc_directive = '!$acc ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def ATOMIC(atomic='update', extraAccArgs=None)
+    #:assert isinstance(atomic, str)
+    #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture')
+
+    #:set atomic_val = atomic
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = atomic_val.strip('\n')
+    #:set acc_directive = '!$acc atomic ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def UPDATE(host=None, device=None, extraAccArgs=None)
+    #:set host_val = GEN_HOST_STR(host)
+
+    #:set device_val = GEN_DEVICE_STR(device)
+
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = host_val.strip('\n') + device_val.strip('\n')
+    #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
+#:def WAIT(host=None, device=None, extraAccArgs=None)
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = host_val.strip('\n') + device_val.strip('\n')
+    #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+! New line at end of file is required for FYPP
\ No newline at end of file
diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index d79960d87f..679c276bf4 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -102,394 +102,4 @@
                          //${message or '"No error description."'}$)
     end if
 #:enddef
-
-#:def ASSERT_LIST(data, datatype)
-    #:assert data is not None
-    #:assert isinstance(data, list)
-    #:assert len(data) != 0
-    #:assert all(isinstance(element, datatype) for element in data)
-#:enddef
-
-#:def GEN_PARENTHESES_CLAUSE(clause_name, clause_list)
-    #:assert isinstance(clause_name, str)
-    #:if clause_list is not None
-        $:ASSERT_LIST(clause_list, str)
-        #:set clause_str = clause_name + '(' + ', '.join(clause_list) + ') '
-    #:else
-        #:set clause_str = ''
-    #:endif
-    $:clause_str
-#:enddef
-
-#:def GEN_PRIVATE_STR(private, initalized_values)
-    #:assert isinstance(initalized_values, bool)
-    #:if initalized_values == True
-        #:set private_val = GEN_PARENTHESES_CLAUSE('firstprivate', private)
-    #:else
-        #:set private_val = GEN_PARENTHESES_CLAUSE('private', private)
-    #:endif
-    $:private_val
-#:enddef
-
-#:def GEN_COPY_STR(copy)
-    #:set copy_val = GEN_PARENTHESES_CLAUSE('copy', copy)
-    $:copy_val
-#:enddef
-
-#:def GEN_COPYIN_STR(copyin, readonly)
-    #:assert isinstance(readonly, bool)
-    #:if copyin is not None
-        $:ASSERT_LIST(copyin, str)
-        #:if readonly == True
-            #:set copyin_val = 'copyin(readonly:' + ', '.join(copyin) + ') '
-        #:else
-            #:set copyin_val = 'copyin(' + ', '.join(copyin) + ') '
-        #:endif
-    #:else
-        #:set copyin_val = ''
-    #:endif
-    $:copyin_val
-#:enddef
-
-#:def GEN_COPYOUT_STR(copyout)
-    #:set copyout_val = GEN_PARENTHESES_CLAUSE('copyout', copyout)
-    $:copyout_val
-#:enddef
-
-#:def GEN_CREATE_STR(create)
-    #:set create_val = GEN_PARENTHESES_CLAUSE('create', create)
-    $:create_val
-#:enddef
-
-#:def GEN_NOCREATE_STR(no_create)
-    #:set nocreate_val = GEN_PARENTHESES_CLAUSE('no_create', no_create)
-    $:nocreate_val
-#:enddef
-
-#:def GEN_DELETE_STR(delete)
-    #:set delete_val = GEN_PARENTHESES_CLAUSE('delete', delete)
-    $:delete_val
-#:enddef
-
-#:def GEN_PRESENT_STR(present)
-    #:set present_val = GEN_PARENTHESES_CLAUSE('present', present)
-    $:present_val
-#:enddef
-
-#:def GEN_DEVICEPTR_STR(deviceptr)
-    #:set deviceptr_val = GEN_PARENTHESES_CLAUSE('deviceptr', deviceptr)
-    $:deviceptr_val
-#:enddef
-
-#:def GEN_ATTACH_STR(attach)
-    #:set attach_val = GEN_PARENTHESES_CLAUSE('attach', attach)
-    $:attach_val
-#:enddef
-
-#:def GEN_DETACH_STR(detach)
-    #:set detach_val = GEN_PARENTHESES_CLAUSE('detach', detach)
-    $:detach_val
-#:enddef
-
-#:def GEN_LINK_STR(link)
-    #:set link_val = GEN_PARENTHESES_CLAUSE('link', link)
-    $:link_val
-#:enddef
-
-#:def GEN_EXTRA_ARGS_STR(extraArgs)
-    #:if extraArgs is not None
-        #:assert isinstance(extraArgs, str)
-        #:set extraArgs_val = extraArgs
-    #:else
-        #:set extraArgs_val = ''
-    #:endif
-    $:extraArgs_val
-#:enddef
-
-#:def GEN_PARALLELISM_STR(parallelism)
-    #:if parallelism is not None
-        $:ASSERT_LIST(parallelism, str)
-        #:assert all((element == 'gang' or element == 'worker' or &
-            & element == 'vector' or element == 'seq') for element in parallelism)
-        #:set parallelism_val = ' '.join(parallelism) + ' '
-    #:else
-        #:set parallelism_val = ''
-    #:endif
-    $:parallelism_val
-#:enddef
-
-#:def GEN_COLLAPSE_STR(collapse)
-    #:if collapse is not None
-        #:assert isinstance(collapse, int)
-        #:assert collapse > 1
-        #:set collapse_val = 'collapse(' + str(collapse) + ') '
-    #:else
-        #:set collapse_val = ''
-    #:endif
-    $:collapse_val
-#:enddef
-
-#:def GEN_DEFAULT_STR(default)
-    #:if default is not None
-        #:assert isinstance(default, str)
-        #:assert (default == 'present' or default == 'none')
-        #:set default_val = 'default(' + default + ') '
-    #:else
-        #:set default_val = ''
-    #:endif
-    $:default_val
-#:enddef
-
-#:def GEN_REDUCTION_STR(reduction, reductionOp)
-    #:if reduction is not None and reductionOp is not None
-        #:if isinstance(reduction, list) and isinstance(reductionOp, list)
-            $:ASSERT_LIST(reduction, list)
-            $:ASSERT_LIST(reductionOp, str)
-            #:assert all(len(element) != 0 for element in reduction)
-            #:assert all(isinstance(element, str) for sublist in reduction for element in sublist)
-            #:assert len(reduction) == len(reductionOp)
-            #:set reduction_list = ['reduction(' + op + ':' + ', '.join(red) + ') ' for (red, op) in zip(reduction, reductionOp)]
-            #:set reduction_val = ' '. join(reduction_list) + ' '
-        #:elif isinstance(reduction, list) and isinstance(reductionOp, str)
-            $:ASSERT_LIST(reduction, str)
-            #:assert isinstance(reductionOp, str)
-            #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
-        #:else
-            #:stop 'Invalid datatypes for reduction or reductionOp. Must be list of lists and lists or list and str respectively'
-        #:endif
-    #:elif reduction is not None or reductionOp is not None
-        #:stop 'Cannot set the reduction list or reduction operation without setting the other'
-    #:else
-        #:set reduction_val = ''
-    #:endif
-    $:reduction_val
-#:enddef
-
-#:def GEN_HOST_STR(host)
-    #:set host_val = GEN_PARENTHESES_CLAUSE('host', host)
-    $:host_val
-#:enddef
-
-#:def GEN_DEVICE_STR(device)
-    #:set device_val = GEN_PARENTHESES_CLAUSE('device', device)
-    $:device_val
-#:enddef
-
-
-#:def PARALLEL_LOOP(collapse=None, private=None, parallelism=['gang', 'vector'], &
-    & default='present', firstprivate=None, reduction=None, reductionOp=None, &
-    & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, &
-    & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None)
-
-    #:set collapse_val = GEN_COLLAPSE_STR(collapse)
-    
-    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
-
-    #:set default_val = GEN_DEFAULT_STR(default)
-
-    #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n')
-
-    #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
-
-    #:set copy_val = GEN_COPY_STR(copy)
-
-    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
-    
-    #:set copyout_val = GEN_COPYOUT_STR(copyout)
-
-    #:set create_val = GEN_CREATE_STR(create)
-
-    #:set no_create_val = GEN_NOCREATE_STR(no_create)
-
-    #:set present_val = GEN_PRESENT_STR(present)
-
-    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
-
-    #:set attach_val = GEN_ATTACH_STR(attach)
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
-        & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + &
-        & copy_val.strip('\n') + copyin_val.strip('\n') + &
-        & copyout_val.strip('\n') + create_val.strip('\n') + &
-        & no_create_val.strip('\n') + present_val.strip('\n') + &
-        & deviceptr_val.strip('\n') + attach_val.strip('\n')
-    #:set acc_directive = '!$acc parallel loop ' + &
-        & clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def ROUTINE(parallelism=['seq'], nohost=False, extraAccArgs=None)
-    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
-
-    #:assert isinstance(nohost, bool)
-    #:if nohost == True
-        #:set nohost_val = 'nohost'
-    #:else
-        #:set nohost_val = ''
-    #:endif
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = parallelism_val.strip('\n') + nohost_val.strip('\n')
-    #:set acc_directive = '!$acc routine ' + &
-        & clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
-    
-    #:set copy_val = GEN_COPY_STR(copy)
-
-    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
-
-    #:set copyout_val = GEN_COPYOUT_STR(copyout)
-
-    #:set create_val = GEN_CREATE_STR(create)
-
-    #:set present_val = GEN_PRESENT_STR(present)
-
-    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
-
-    #:set link_val = GEN_LINK_STR(link)
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + &
-        & copyout_val.strip('\n') + create_val.strip('\n') + &
-        & present_val.strip('\n') + deviceptr_val.strip('\n') + &
-        & link_val.strip('\n')
-    #:set acc_directive = '!$acc declare ' + clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def LOOP(collapse=None, parallelism=["seq"], data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
-
-    #:set collapse_val = GEN_COLLAPSE_STR(collapse)
-
-    #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
-
-    #:if data_dependency is not None
-        #:assert isinstance(data_dependency, str)
-        #:assert (data_dependency == 'auto' or data_dependency == 'independent')
-        #:set data_dependency_val = data_dependency 
-    #:else
-        #:set data_dependency_val = ''
-    #:endif
-    
-    #:set private_val = GEN_PRIVATE_STR(private, False)
-
-    #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
-        & data_dependency_val.strip('\n') + private_val.strip('\n') + &
-        & reduction_val.strip('\n')
-    #:set acc_directive = '!$acc loop ' + &
-        & clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def DATA(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None)
-    
-    #:set copy_val = GEN_COPY_STR(copy)
-
-    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
-
-    #:set copyout_val = GEN_COPYOUT_STR(copyout)
-
-    #:set create_val = GEN_CREATE_STR(create)
-
-    #:set no_create_val = GEN_NOCREATE_STR(no_create)
-
-    #:set present_val = GEN_PRESENT_STR(present)
-
-    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
-
-    #:set attach_val = GEN_ATTACH_STR(attach)
-
-    #:set default_val = GEN_DEFAULT_STR(default)
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + &
-        & copyout_val.strip('\n') + create_val.strip('\n') + &
-        & no_create_val.strip('\n') + present_val.strip('\n') + & 
-        & deviceptr_val.strip('\n') + attach_val.strip('\n') + &
-        & default_val.strip('\n')
-    #:set acc_directive = '!$acc data ' + clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None)
-    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
-
-    #:set create_val = GEN_CREATE_STR(create)
-
-    #:set attach_val = GEN_ATTACH_STR(attach)
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = copyin_val.strip('\n') + create_val.strip('\n') + attach_val.strip('\n')
-    #:set acc_directive = '!$acc enter data ' + clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None)
-    #:set copyout_val = GEN_COPYOUT_STR(copyout)
-
-    #:set delete_val = GEN_DELETE_STR(delete)
-
-    #:set detach_val = GEN_DETACH_STR(detach)
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = copyout_val.strip('\n') + delete_val.strip('\n') + detach_val.strip('\n')
-    #:set acc_directive = '!$acc exit data ' + clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def CACHE(cache, extraAccArgs=None)
-    #:set cache_val = GEN_PARENTHESES_CLAUSE('cache', cache)
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = cache_val.strip('\n')
-    #:set acc_directive = '!$acc ' + clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def ATOMIC(atomic='update', extraAccArgs=None)
-    #:assert isinstance(atomic, str)
-    #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture')
-
-    #:set atomic_val = atomic
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = atomic_val.strip('\n')
-    #:set acc_directive = '!$acc atomic ' + clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def UPDATE(host=None, device=None, extraAccArgs=None)
-    #:set host_val = GEN_HOST_STR(host)
-
-    #:set device_val = GEN_DEVICE_STR(device)
-
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = host_val.strip('\n') + device_val.strip('\n')
-    #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
-
-#:def WAIT(host=None, device=None, extraAccArgs=None)
-    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
-    #:set clause_val = host_val.strip('\n') + device_val.strip('\n')
-    #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
-#:enddef
+! New line at end of file is required for FYPP
\ No newline at end of file
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 3b8732c593..eed3cc5086 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -6,6 +6,7 @@
 !! boundary condiitons
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 module m_boundary_common
 
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index 416ecc63cc..e7c42b36f6 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -4,6 +4,7 @@
 !! @author Henry Le Berre <hberre3@gatech.edu>
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 #:include 'case.fpp'
 
 module m_chemistry
diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp
index ab3f331aad..18a0a84484 100644
--- a/src/common/m_finite_differences.fpp
+++ b/src/common/m_finite_differences.fpp
@@ -1,4 +1,5 @@
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 module m_finite_differences
 
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 48a2b748a0..ba911005bf 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -2,6 +2,7 @@
 !> procedure.
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 module m_phase_change
 
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index c6c662ce81..024934df1e 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_variables_conversion
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 #:include 'case.fpp'
 
 !> @brief This module consists of subroutines used in the conversion of the
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index a1bc61efe8..0fa8f33566 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_acoustic_src
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The module contains the subroutines used to create a acoustic source pressure source term
 module m_acoustic_src
diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index b0daf8dd50..4080a7ddf1 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -1,4 +1,5 @@
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 module m_body_forces
 
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index ca318f8cee..dfe0f3da49 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_bubbles_EE
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief This module is used to compute the ensemble-averaged bubble dynamic variables
 module m_bubbles_EE
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 5b6c818231..b745739058 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_bubbles_EL
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief This module is used to to compute the volume-averaged bubble model
 module m_bubbles_EL
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index ebeef9485d..73458d56f8 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -19,6 +19,7 @@
 !!              Please refer to Thompson (1987, 1990) for detailed descriptions.
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 module m_cbc
 
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 3d87014ea1..ca0d7dbb8d 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_data_output
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The primary purpose of this module is to output the grid and the
 !!              conservative variables data at the chosen time-step interval. In
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 0a32961dce..10e063496c 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_fftw
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The module contains the subroutines for the FFT routines
 module m_fftw
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index e1a5d34a27..cd0552d438 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_hyperelastic
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief This module consists of subroutines used in the calculation
 !!              of the cauchy tensor
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index b298833cf5..68032cb5b8 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_hypoelastic
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief This module is used to compute source terms for hypoelastic model
 module m_hypoelastic
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index 9b87a5a3be..8e6ea6c820 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_mhd
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief This module is used to compute source terms for magnetohydrodynamics
 !!              Note: not applicable for 1D
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 924a0ed758..edaf2ee367 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_qbmm
 
 #:include 'case.fpp'
+#:include 'directive_macros.fpp'
 #:include 'macros.fpp'
 
 !> @brief This module is used to compute moment inversion via qbmm
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 87c362aaea..730ccba98b 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -4,6 +4,7 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The module contains the subroutines used to calculate the right-
 !!              hane-side (RHS) in the quasi-conservative, shock- and interface-
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 594e0009e3..d82ce6b154 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -21,6 +21,7 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 #:include 'inline_riemann.fpp'
 
 module m_riemann_solvers
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 8b4d3d923f..a3be874f86 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -1,4 +1,5 @@
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 #:include 'inline_capillary.fpp'
 
 !> @brief This module is used to compute source terms for surface tension model
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 7ee08b1223..7c1c986ac9 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_time_steppers
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The following module features a variety of time-stepping schemes.
 !!              Currently, it includes the following Runge-Kutta (RK) algorithms:
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 244230c01f..cd48b01e50 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -2,6 +2,7 @@
 !! @file m_viscous.f90
 !! @brief Contains module m_viscous
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The module contains the subroutines used to compute viscous terms.
 module m_viscous
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index ad9551e096..1915254cc3 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -2,6 +2,7 @@
 !! @file m_weno.f90
 !! @brief Contains module m_weno
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief  Weighted essentially non-oscillatory (WENO) reconstruction scheme
 !!              that is supplemented with monotonicity preserving bounds (MPWENO)

From 25a57168b666e59654d7594746e8c127f7bcc875 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 10 Jun 2025 20:15:51 -0400
Subject: [PATCH 15/75] Fixed spelling issue

---
 src/common/include/directive_macros.fpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/common/include/directive_macros.fpp b/src/common/include/directive_macros.fpp
index 2760fd1e9e..4e6e62f746 100644
--- a/src/common/include/directive_macros.fpp
+++ b/src/common/include/directive_macros.fpp
@@ -16,9 +16,9 @@
     $:clause_str
 #:enddef
 
-#:def GEN_PRIVATE_STR(private, initalized_values)
-    #:assert isinstance(initalized_values, bool)
-    #:if initalized_values == True
+#:def GEN_PRIVATE_STR(private, initialized_values)
+    #:assert isinstance(initialized_values, bool)
+    #:if initialized_values == True
         #:set private_val = GEN_PARENTHESES_CLAUSE('firstprivate', private)
     #:else
         #:set private_val = GEN_PARENTHESES_CLAUSE('private', private)

From e8d78e8bf408163c419148c8a9e605d8a36e8719 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 10 Jun 2025 22:16:23 -0400
Subject: [PATCH 16/75] Replaced Openacc routine directives

---
 src/common/m_boundary_common.fpp              | 22 ++++++------
 src/common/m_helper.fpp                       |  5 +--
 ...{m_helper_basic.f90 => m_helper_basic.fpp} |  7 ++--
 src/common/m_phase_change.fpp                 | 12 +++----
 src/common/m_variables_conversion.fpp         | 10 +++---
 src/pre_process/m_assign_variables.fpp        |  5 +--
 src/pre_process/m_patches.fpp                 |  9 ++---
 src/simulation/m_acoustic_src.fpp             |  6 ++--
 src/simulation/m_bubbles.fpp                  | 35 ++++++++++---------
 src/simulation/m_bubbles_EL.fpp               |  4 +--
 src/simulation/m_bubbles_EL_kernels.fpp       | 13 +++----
 src/simulation/m_compute_cbc.fpp              | 17 ++++-----
 src/simulation/m_hyperelastic.fpp             |  4 +--
 src/simulation/m_ibm.fpp                      |  3 +-
 src/simulation/m_qbmm.fpp                     | 12 +++----
 src/simulation/m_riemann_solvers.fpp          |  4 +--
 .../{m_sim_helpers.f90 => m_sim_helpers.fpp}  |  7 ++--
 17 files changed, 92 insertions(+), 83 deletions(-)
 rename src/common/{m_helper_basic.f90 => m_helper_basic.fpp} (97%)
 rename src/simulation/{m_sim_helpers.f90 => m_sim_helpers.fpp} (99%)

diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index eed3cc5086..a3460a3950 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -242,7 +242,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_ghost_cell_extrapolation
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -311,7 +311,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_symmetry
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -575,7 +575,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_periodic
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -718,7 +718,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_axis
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -782,7 +782,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_slip_wall
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -881,7 +881,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_no_slip_wall
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -1016,7 +1016,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_dirichlet
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -1085,7 +1085,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_qbmm_extrapolation
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
         integer, intent(in) :: bc_dir, bc_loc
@@ -1284,7 +1284,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_periodic
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
@@ -1342,7 +1342,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_reflective
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
@@ -1424,7 +1424,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_ghost_cell_extrapolation
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index 4be5974c2e..a73afddda0 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -1,4 +1,5 @@
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 !>
 !! @file m_helper.f90
 !! @brief Contains module m_helper
@@ -43,7 +44,7 @@ contains
         !! @param Rtmp is the  bubble radii
         !! @param ntmp is the output number bubble density
     pure subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: vftmp
         real(wp), dimension(nb), intent(in) :: Rtmp
         real(wp), intent(out) :: ntmp
@@ -57,7 +58,7 @@ contains
     end subroutine s_comp_n_from_prim
 
     pure subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: vftmp
         real(wp), dimension(nb), intent(in) :: nRtmp
         real(wp), intent(out) :: ntmp
diff --git a/src/common/m_helper_basic.f90 b/src/common/m_helper_basic.fpp
similarity index 97%
rename from src/common/m_helper_basic.f90
rename to src/common/m_helper_basic.fpp
index 4205279cd4..d0ace9e551 100644
--- a/src/common/m_helper_basic.f90
+++ b/src/common/m_helper_basic.fpp
@@ -1,3 +1,4 @@
+#:include 'directive_macros.fpp'
 !>
 !! @file m_helper_basic.f90
 !! @brief Contains module m_helper_basic
@@ -23,7 +24,7 @@ module m_helper_basic
     !! @param tol_input Relative error (default = 1e-6_wp).
     !! @return Result of the comparison.
     logical pure elemental function f_approx_equal(a, b, tol_input) result(res)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: a, b
         real(wp), optional, intent(in) :: tol_input
         real(wp) :: tol
@@ -46,7 +47,7 @@ end function f_approx_equal
     !> Checks if a real(wp) variable is of default value.
     !! @param var Variable to check.
     logical pure elemental function f_is_default(var) result(res)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: var
 
         res = f_approx_equal(var, dflt_real)
@@ -71,7 +72,7 @@ end function f_all_default
     !> Checks if a real(wp) variable is an integer.
     !! @param var Variable to check.
     logical pure elemental function f_is_integer(var) result(res)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: var
 
         res = f_approx_equal(var, real(nint(var), wp))
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index ba911005bf..fd03e5f09a 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -290,7 +290,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_infinite_pt_relaxation_k
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         ! initializing variables
@@ -395,7 +395,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_infinite_ptg_relaxation_k
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         integer, intent(in) :: j, k, l
@@ -520,7 +520,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_correct_partial_densities
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         !> @name variables for the correction of the reacting partial densities
@@ -583,7 +583,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_jacobian_matrix
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         real(wp), dimension(2, 2), intent(out) :: InvJac
@@ -690,7 +690,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_pTg_residue
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         integer, intent(in) :: j, k, l
@@ -741,7 +741,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_TSat
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         real(wp), intent(in) :: pSat
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 024934df1e..7523afd13d 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -121,7 +121,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_pressure
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         real(wp), intent(in) :: energy, alf
@@ -469,7 +469,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_acc
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K
@@ -552,7 +552,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_bubbles_acc
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         real(wp), intent(inout) :: rho_K, gamma_K, pi_inf_K, qv_K
@@ -1618,7 +1618,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_speed_of_sound
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         real(wp), intent(in) :: pres
@@ -1688,7 +1688,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_fast_magnetosonic_speed
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         real(wp), intent(in) :: B(3), rho, c
diff --git a/src/pre_process/m_assign_variables.fpp b/src/pre_process/m_assign_variables.fpp
index 11eb62f9e6..1adfd6ab1f 100644
--- a/src/pre_process/m_assign_variables.fpp
+++ b/src/pre_process/m_assign_variables.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_assign_variables
 
 #:include 'case.fpp'
+#:include 'directive_macros.fpp'
 
 module m_assign_variables
 
@@ -103,7 +104,7 @@ contains
         !! @param patch_id_fp Array to track patch ids
     pure subroutine s_assign_patch_mixture_primitive_variables(patch_id, j, k, l, &
                                                                eta, q_prim_vf, patch_id_fp)
-        !$acc routine seq
+        $:ROUTINE()
 
         integer, intent(in) :: patch_id
         integer, intent(in) :: j, k, l
@@ -276,7 +277,7 @@ contains
         !! @param patch_id_fp Array to track patch ids
     impure subroutine s_assign_patch_species_primitive_variables(patch_id, j, k, l, &
                                                                  eta, q_prim_vf, patch_id_fp)
-        !$acc routine seq
+        $:ROUTINE()
 
         integer, intent(in) :: patch_id
         integer, intent(in) :: j, k, l
diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp
index 2425553d3f..584048aa02 100644
--- a/src/pre_process/m_patches.fpp
+++ b/src/pre_process/m_patches.fpp
@@ -7,6 +7,7 @@
 #:include '2dHardcodedIC.fpp'
 #:include '3dHardcodedIC.fpp'
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 module m_patches
 
@@ -2386,7 +2387,7 @@ contains
     end subroutine s_model
 
     subroutine s_convert_cylindrical_to_cartesian_coord(cyl_y, cyl_z)
-        !$acc routine seq
+        $:ROUTINE()
 
         real(wp), intent(in) :: cyl_y, cyl_z
 
@@ -2397,7 +2398,7 @@ contains
 
     pure function f_convert_cyl_to_cart(cyl) result(cart)
 
-        !$acc routine seq
+        $:ROUTINE()
 
         t_vec3, intent(in) :: cyl
         t_vec3 :: cart
@@ -2409,7 +2410,7 @@ contains
     end function f_convert_cyl_to_cart
 
     subroutine s_convert_cylindrical_to_spherical_coord(cyl_x, cyl_y)
-        !$acc routine seq
+        $:ROUTINE()
 
         real(wp), intent(IN) :: cyl_x, cyl_y
 
@@ -2422,7 +2423,7 @@ contains
     !! @param offset Thickness
     !! @param a Starting position
     pure elemental function f_r(myth, offset, a)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: myth, offset, a
         real(wp) :: b
         real(wp) :: f_r
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 0fa8f33566..7c15bb5cf1 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -340,7 +340,7 @@ contains
     !! @param gauss_sigma_time_local sigma in time for Gaussian pulse
     !! @param source Source term amplitude
     pure elemental subroutine s_source_temporal(sim_time, c, ai, term_index, frequency_local, gauss_sigma_time_local, source, sum_BB)
-        !$acc routine seq
+        $:ROUTINE()
         integer, intent(in) :: ai, term_index
         real(wp), intent(in) :: sim_time, c, sum_BB
         real(wp), intent(in) :: frequency_local, gauss_sigma_time_local
@@ -692,7 +692,7 @@ contains
     !! @param c Speed of sound
     !! @return frequency_local Converted frequency
     pure elemental function f_frequency_local(freq_conv_flag, ai, c)
-        !$acc routine seq
+        $:ROUTINE()
         logical, intent(in) :: freq_conv_flag
         integer, intent(in) :: ai
         real(wp), intent(in) :: c
@@ -711,7 +711,7 @@ contains
     !! @param ai Acoustic source index
     !! @return gauss_sigma_time_local Converted Gaussian sigma time
     pure elemental function f_gauss_sigma_time_local(gauss_conv_flag, ai, c)
-        !$acc routine seq
+        $:ROUTINE()
         logical, intent(in) :: gauss_conv_flag
         integer, intent(in) :: ai
         real(wp), intent(in) :: c
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index bf008b9b00..b64c28a4ee 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_bubbles
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief This module contains the procedures shared by the ensemble-averaged and volume-averaged bubble models.
 module m_bubbles
@@ -39,7 +40,7 @@ contains
         !!  @param f_divu Divergence of velocity
         !!  @param fCson Speed of sound from fP (EL)
     pure elemental function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu
         real(wp), intent(in) :: fCson
@@ -80,7 +81,7 @@ contains
         !!  @param fV Current bubble velocity
         !!  @param fpb Internal bubble pressure
     pure elemental function f_cpbw(fR0, fR, fV, fpb)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fR0, fR, fV, fpb
 
         real(wp) :: f_cpbw
@@ -99,7 +100,7 @@ contains
         !!  @param fntait Tait EOS parameter
         !!  @param fBtait Tait EOS parameter
     pure elemental function f_H(fCpbw, fCpinf, fntait, fBtait)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fCpbw, fCpinf, fntait, fBtait
 
         real(wp) :: tmp1, tmp2, tmp3
@@ -119,7 +120,7 @@ contains
         !! @param fBtait Tait EOS parameter
         !! @param fH Bubble enthalpy
     pure elemental function f_cgas(fCpinf, fntait, fBtait, fH)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fCpinf, fntait, fBtait, fH
 
         real(wp) :: tmp
@@ -142,7 +143,7 @@ contains
         !!  @param advsrc Advection equation source term
         !!  @param divu Divergence of velocity
     pure elemental function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fRho, fP, falf, fntait, fBtait, advsrc, divu
 
         real(wp) :: c2_liquid
@@ -172,7 +173,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fpbdot Time derivative of the internal bubble pressure
     pure elemental function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fCpbw, fCpinf, fCpinf_dot, fntait, fBtait
         real(wp), intent(in) :: fR, fV, fR0, fpbdot
 
@@ -208,7 +209,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fCpbw Boundary wall pressure
     pure elemental function f_rddot_RP(fCp, fRho, fR, fV, fR0, fCpbw)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fCp, fRho, fR, fV, fR0, fCpbw
 
         real(wp) :: f_rddot_RP
@@ -231,7 +232,7 @@ contains
         !!  @param fntait Tait EOS parameter
         !!  @param fBtait Tait EOS parameter
     pure elemental function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fCpbw, fR, fV, fH, fHdot
         real(wp), intent(in) :: fcgas, fntait, fBtait
 
@@ -254,7 +255,7 @@ contains
         !!  @param fV Current bubble velocity
         !!  @param fpb Internal bubble pressure
     pure elemental function f_cpbw_KM(fR0, fR, fV, fpb)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fR0, fR, fV, fpb
         real(wp) :: f_cpbw_KM
 
@@ -281,7 +282,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fC Current sound speed
     pure elemental function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fpbdot, fCp, fCpbw
         real(wp), intent(in) :: fRho, fR, fV, fR0, fC
 
@@ -315,7 +316,7 @@ contains
         !!  @param pb Internal bubble pressure
         !!  @param iR0 Current bubble size index
     pure elemental subroutine s_bwproperty(pb, iR0, chi_vw, k_mw, rho_mw)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: pb
         integer, intent(in) :: iR0
         real(wp), intent(out) :: chi_vw  !< Bubble wall properties (Ando 2010)
@@ -345,7 +346,7 @@ contains
         !!  @param fR_m Mixture gas constant (EL)
         !!  @param fgamma_m Mixture gamma (EL)
     pure elemental subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_n, fbeta_c, fR_m, fgamma_m)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fR
         real(wp), intent(in) :: fV
         real(wp), intent(in) :: fpb
@@ -403,7 +404,7 @@ contains
         !!  @param fR_m Mixture gas constant (EL)
         !!  @param fgamma_m Mixture gamma (EL)
     pure elemental function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in) :: fvflux
         real(wp), intent(in) :: fR
         real(wp), intent(in) :: fV
@@ -466,7 +467,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_advance_step
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), intent(inout) :: fR, fV, fpb, fmass_v
         real(wp), intent(in) :: fRho, fP, fR0, fpbdot, alf
@@ -600,7 +601,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_initial_substep_h
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu
@@ -686,7 +687,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_advance_substep
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), intent(OUT) :: err
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
@@ -784,7 +785,7 @@ contains
         !!  @param fdMvdt_tmp Rate of change of the mass of vapor in the bubble
     pure elemental subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, &
                                            fmass_n, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(IN) :: fR_tmp, fV_tmp, fPb_tmp, fMv_tmp
         real(wp), intent(IN) :: fmass_n, fbeta_c, fbeta_t
         integer, intent(IN) :: bub_id
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index b745739058..f83b64a3fd 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -738,7 +738,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_cson_from_pinf
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         integer, intent(in) :: bub_id
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
@@ -809,7 +809,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_pinf
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         integer, intent(in) :: bub_id, ptype
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index 47566eac19..c173c82953 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_bubbles_EL_kernels
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief This module contains kernel functions used to map the effect of the lagrangian bubbles
 !!        in the Eulerian framework.
@@ -203,7 +204,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_applygaussian
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: center
         integer, dimension(3), intent(in) :: cellaux
@@ -273,7 +274,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_check_celloutside
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         integer, dimension(3), intent(inout) :: cellaux
         logical, intent(out) :: celloutside
@@ -309,7 +310,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_shift_cell_symmetric_bc
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         integer, dimension(3), intent(inout) :: cellaux
         integer, dimension(3), intent(in) :: cell
@@ -350,7 +351,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_stddsv
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         integer, dimension(3), intent(in) :: cell
         real(wp), intent(in) :: volpart
@@ -391,7 +392,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_char_vol
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         integer, intent(in) :: cellx, celly, cellz
         real(wp), intent(out) :: Charvol
@@ -416,7 +417,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_cell
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: s_cell
         integer, dimension(3), intent(out) :: get_cell
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index 275acb7f6f..23136a0198 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -1,3 +1,4 @@
+#:include 'directive_macros.fpp'
 !>
 !! @file m_compute_cbc.f90
 !! @brief Contains module m_compute_cbc
@@ -27,7 +28,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_slip_wall_L
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -57,7 +58,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_buffer_L
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -107,7 +108,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_inflow_L
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -141,7 +142,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_outflow_L
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -190,7 +191,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_force_free_subsonic_outflow_L
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -228,7 +229,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_constant_pressure_subsonic_outflow_L
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -267,7 +268,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_inflow_L
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -298,7 +299,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_outflow_L
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index cd0552d438..487e0ceb64 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -221,7 +221,7 @@ contains
         !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor
         !! btensor is symmetric, save the data space
     pure subroutine s_neoHookean_cauchy_solver(btensor, q_prim_vf, G, j, k, l)
-        !$acc routine seq
+        $:ROUTINE()
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(scalar_field), dimension(b_size), intent(inout) :: btensor
         real(wp), intent(in) :: G
@@ -260,7 +260,7 @@ contains
         !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor
         !! btensor is symmetric, save the data space
     pure subroutine s_Mooney_Rivlin_cauchy_solver(btensor, q_prim_vf, G, j, k, l)
-        !$acc routine seq
+        $:ROUTINE()
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(scalar_field), dimension(b_size), intent(inout) :: btensor
         real(wp), intent(in) :: G
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 420a024545..31fd99fe99 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_ibm
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief This module is used to handle all operations related to immersed
 !!              boundary methods (IBMs)
@@ -744,7 +745,7 @@ contains
     !> Function that uses the interpolation coefficients and the current state
     !! at the cell centers in order to estimate the state at the image point
     pure subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb, mv, presb_IP, massv_IP)
-        !$acc routine seq
+        $:ROUTINE()
         type(scalar_field), &
             dimension(sys_size), &
             intent(IN) :: q_prim_vf !< Primitive Variables
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index edaf2ee367..f9ddd1f2d1 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -684,7 +684,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_coeff_nonpoly
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), intent(in) :: pres, rho, c
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
@@ -757,7 +757,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_coeff
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         real(wp), intent(inout) :: pres, rho, c
@@ -1038,7 +1038,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_chyqmom
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(nmom), intent(in) :: momin
         real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY
@@ -1105,7 +1105,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_hyqmom
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
         real(wp), dimension(2), intent(inout) :: frho, fup
         real(wp), dimension(3), intent(in) :: fmom
@@ -1124,7 +1124,7 @@ contains
     end subroutine s_hyqmom
 
     pure function f_quad(abscX, abscY, wght_in, q, r, s)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
         real(wp), intent(in) :: q, r, s
 
@@ -1140,7 +1140,7 @@ contains
     end function f_quad
 
     pure function f_quad2D(abscX, abscY, wght_in, pow)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
         real(wp), dimension(3), intent(in) :: pow
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index d82ce6b154..c496ce7632 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -4265,7 +4265,7 @@ contains
     !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
     !! @param[out] tau_shear_out Calculated shear stress tensor (stress on i-face, j-direction).
     pure subroutine s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, tau_shear_out)
-        !$acc routine seq
+        $:ROUTINE()
 
         implicit none
 
@@ -4299,7 +4299,7 @@ contains
     !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
     !! @param[out] tau_bulk_out Calculated bulk stress tensor (stress on i-face, i-direction).
     pure subroutine s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, tau_bulk_out)
-        !$acc routine seq
+        $:ROUTINE()
 
         implicit none
 
diff --git a/src/simulation/m_sim_helpers.f90 b/src/simulation/m_sim_helpers.fpp
similarity index 99%
rename from src/simulation/m_sim_helpers.f90
rename to src/simulation/m_sim_helpers.fpp
index 64960b1353..85a00f7b13 100644
--- a/src/simulation/m_sim_helpers.f90
+++ b/src/simulation/m_sim_helpers.fpp
@@ -1,3 +1,4 @@
+#:include 'directive_macros.fpp'
 module m_sim_helpers
 
     use m_derived_types        !< Definitions of the derived types
@@ -33,7 +34,7 @@ pure subroutine s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, a
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_enthalpy
 #else
-        !$acc routine seq
+        $:ROUTINE()
 #endif
 
         type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
@@ -98,7 +99,7 @@ end subroutine s_compute_enthalpy
         !! @param vcfl_sf (optional) cell centered viscous cfl number
         !! @param Rc_sf (optional) cell centered Rc
     pure subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), intent(in), dimension(num_vels) :: vel
         real(wp), intent(in) :: c, rho
         real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: icfl_sf
@@ -195,7 +196,7 @@ end subroutine s_compute_stability_from_dt
         !! @param k y coordinate
         !! @param l z coordinate
     pure subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l)
-        !$acc routine seq
+        $:ROUTINE()
         real(wp), dimension(num_vels), intent(in) :: vel
         real(wp), intent(in) :: c, rho
         real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: max_dt

From 779ea27f0eb8f7b02d2b665004fcf9e0b30e4cb6 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 10 Jun 2025 23:21:34 -0400
Subject: [PATCH 17/75] Replaced non-parallel loops

---
 src/common/m_chemistry.fpp                |  10 +-
 src/common/m_phase_change.fpp             |  12 +-
 src/common/m_variables_conversion.fpp     |  70 +++---
 src/simulation/include/inline_riemann.fpp |   4 +-
 src/simulation/m_acoustic_src.fpp         |   8 +-
 src/simulation/m_bubbles_EE.fpp           |  20 +-
 src/simulation/m_bubbles_EL.fpp           |  14 +-
 src/simulation/m_cbc.fpp                  |  64 +++---
 src/simulation/m_hyperelastic.fpp         |  16 +-
 src/simulation/m_hypoelastic.fpp          |   6 +-
 src/simulation/m_ibm.fpp                  |  24 +-
 src/simulation/m_mhd.fpp                  |   6 +-
 src/simulation/m_qbmm.fpp                 |  24 +-
 src/simulation/m_rhs.fpp                  |  54 ++---
 src/simulation/m_riemann_solvers.fpp      | 262 +++++++++++-----------
 src/simulation/m_sim_helpers.fpp          |   6 +-
 src/simulation/m_viscous.fpp              | 104 ++++-----
 src/simulation/m_weno.fpp                 |   4 +-
 18 files changed, 354 insertions(+), 354 deletions(-)

diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index e7c42b36f6..14b3c3d1a6 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -36,7 +36,7 @@ contains
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
-                    !$acc loop seq
+                    $:LOOP()
                     do eqn = chemxb, chemxe
                         Ys(eqn - chemxb + 1) = &
                             q_cons_vf(eqn)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z)
@@ -47,7 +47,7 @@ contains
                     ! cons. contxb    = \rho         (1-fluid model)
                     ! cons. momxb + i = \rho u_i
                     energy = q_cons_vf(E_idx)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z)
-                    !$acc loop seq
+                    $:LOOP()
                     do eqn = momxb, momxe
                         energy = energy - &
                                  0.5_wp*(q_cons_vf(eqn)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z))**2._wp
@@ -73,7 +73,7 @@ contains
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
-                    !$acc loop seq
+                    $:LOOP()
                     do i = chemxb, chemxe
                         Ys(i - chemxb + 1) = q_prim_vf(i)%sf(x, y, z)
                     end do
@@ -105,7 +105,7 @@ contains
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
 
-                    !$acc loop seq
+                    $:LOOP()
                     do eqn = chemxb, chemxe
                         Ys(eqn - chemxb + 1) = q_prim_qp(eqn)%sf(x, y, z)
                     end do
@@ -115,7 +115,7 @@ contains
 
                     call get_net_production_rates(rho, T, Ys, omega)
 
-                    !$acc loop seq
+                    $:LOOP()
                     do eqn = chemxb, chemxe
 
                         omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1)
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index fd03e5f09a..89d4e412fe 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -107,7 +107,7 @@ contains
                 do l = 0, p
 
                     rho = 0.0_wp; TvF = 0.0_wp
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_fluids
 
                         ! Mixture density
@@ -133,7 +133,7 @@ contains
 
                     ! kinetic energy as an auxiliary variable to the calculation of the total internal energy
                     dynE = 0.0_wp
-                    !$acc loop seq
+                    $:LOOP()
                     do i = momxb, momxe
 
                         dynE = dynE + 5.0e-1_wp*q_cons_vf(i)%sf(j, k, l)**2/rho
@@ -255,7 +255,7 @@ contains
 
                     ! calculating volume fractions, internal energies, and total entropy
                     rhos = 0.0_wp
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_fluids
 
                         ! volume fractions
@@ -308,7 +308,7 @@ contains
         ! auxiliary variables for the pT-equilibrium solver
         mCP = 0.0_wp; mQ = 0.0_wp; p_infpT = ps_inf; 
         ! Performing tests before initializing the pT-equilibrium
-        !$acc loop seq
+        $:LOOP()
         do i = 1, num_fluids
 
             ! sum of the total alpha*rho*cp of the system
@@ -357,7 +357,7 @@ contains
 
             ! updating functions used in the Newton's solver
             gpp = 0.0_wp; gp = 0.0_wp; hp = 0.0_wp
-            !$acc loop seq
+            $:LOOP()
             do i = 1, num_fluids
 
                 gp = gp + (gs_min(i) - 1.0_wp)*q_cons_vf(i + contxb - 1)%sf(j, k, l)*cvs(i) &
@@ -451,7 +451,7 @@ contains
             mCP = 0.0_wp; mCPD = 0.0_wp; mCVGP = 0.0_wp; mCVGP2 = 0.0_wp; mQ = 0.0_wp; mQD = 0.0_wp
             ! Those must be updated through the iterations, as they either depend on
             ! the partial masses for all fluids, or on the equilibrium pressure
-            !$acc loop seq
+            $:LOOP()
             do i = 1, num_fluids
 
                 ! sum of the total alpha*rho*cp of the system
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 7523afd13d..4634ee0e34 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -759,7 +759,7 @@ contains
 
                     nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l)
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, nb
                         mu = qK_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc
                         sig = (qK_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc - mu**2)**0.5_wp
@@ -792,7 +792,7 @@ contains
 
                     nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l)
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, nb
                         mu = qK_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc
                         sig = (qK_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc - mu**2)**0.5_wp
@@ -892,7 +892,7 @@ contains
                 do j = ibounds(1)%beg, ibounds(1)%end
                     dyn_pres_K = 0._wp
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_fluids
                         alpha_rho_K(i) = qK_cons_vf(i)%sf(j, k, l)
                         alpha_K(i) = qK_cons_vf(advxb + i - 1)%sf(j, k, l)
@@ -936,13 +936,13 @@ contains
                         B2 = B(1)**2 + B(2)**2 + B(3)**2
 
                         m2 = 0._wp
-                        !$acc loop seq
+                        $:LOOP()
                         do i = momxb, momxe
                             m2 = m2 + qK_cons_vf(i)%sf(j, k, l)**2
                         end do
 
                         S = 0._wp
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, 3
                             S = S + qK_cons_vf(momxb + i - 1)%sf(j, k, l)*B(i)
                         end do
@@ -950,14 +950,14 @@ contains
                         E = qK_cons_vf(E_idx)%sf(j, k, l)
 
                         D = 0._wp
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, contxe
                             D = D + qK_cons_vf(i)%sf(j, k, l)
                         end do
 
                         ! Newton-Raphson
                         W = E + D
-                        !$acc loop seq
+                        $:LOOP()
                         do iter = 1, relativity_cons_to_prim_max_iter
                             Ga = (W + B2)*W/sqrt((W + B2)**2*W**2 - (m2*W**2 + S**2*(2*W + B2)))
                             pres = (W - D*Ga)/((gamma_K + 1)*Ga**2) ! Thermal pressure from EOS
@@ -983,13 +983,13 @@ contains
                         qK_prim_vf(E_idx)%sf(j, k, l) = (W - D*Ga)/((gamma_K + 1)*Ga**2)
 
                         ! Recover the other primitive variables
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, 3
                             qK_prim_vf(momxb + i - 1)%sf(j, k, l) = (qK_cons_vf(momxb + i - 1)%sf(j, k, l) + (S/W)*B(i))/(W + B2)
                         end do
                         qK_prim_vf(1)%sf(j, k, l) = D/Ga ! Hard-coded for single-component for now
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = B_idx%beg, B_idx%end
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                         end do
@@ -999,22 +999,22 @@ contains
 
                     if (chemistry) then
                         rho_K = 0._wp
-                        !$acc loop seq
+                        $:LOOP()
                         do i = chemxb, chemxe
                             rho_K = rho_K + max(0._wp, qK_cons_vf(i)%sf(j, k, l))
                         end do
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, contxe
                             qK_prim_vf(i)%sf(j, k, l) = rho_K
                         end do
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = chemxb, chemxe
                             qK_prim_vf(i)%sf(j, k, l) = max(0._wp, qK_cons_vf(i)%sf(j, k, l)/rho_K)
                         end do
                     else
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, contxe
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                         end do
@@ -1024,7 +1024,7 @@ contains
                     rho_K = max(rho_K, sgm_eps)
 #endif
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = momxb, momxe
                         if (model_eqns /= 4) then
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) &
@@ -1038,7 +1038,7 @@ contains
                     end do
 
                     if (chemistry) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_species
                             rhoYks(i) = qK_cons_vf(chemxb + i - 1)%sf(j, k, l)
                         end do
@@ -1068,7 +1068,7 @@ contains
                     end if
 
                     if (bubbles_euler) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, nb
                             nRtmp(i) = qK_cons_vf(bubrs(i))%sf(j, k, l)
                         end do
@@ -1080,7 +1080,7 @@ contains
                             nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l)
 
                             !Convert cons to prim
-                            !$acc loop seq
+                            $:LOOP()
                             do i = bubxb, bubxe
                                 qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc
                             end do
@@ -1097,7 +1097,7 @@ contains
                                 call s_comp_n_from_cons(vftmp, nRtmp, nbub_sc, weight)
                             end if
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = bubxb, bubxe
                                 qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc
                             end do
@@ -1105,21 +1105,21 @@ contains
                     end if
 
                     if (mhd) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = B_idx%beg, B_idx%end
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                         end do
                     end if
 
                     if (elasticity) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = strxb, strxe
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K
                         end do
                     end if
 
                     if (hypoelasticity) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = strxb, strxe
                             ! subtracting elastic contribution for pressure calculation
                             if (G_K > verysmall) then
@@ -1136,13 +1136,13 @@ contains
                     end if
 
                     if (hyperelasticity) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = xibeg, xiend
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K
                         end do
                     end if
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = advxb, advxe
                         qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                     end do
@@ -1487,22 +1487,22 @@ contains
             do k = is2b, is2e
                 do j = is1b, is1e
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, contxe
                         alpha_rho_K(i) = qK_prim_vf(j, k, l, i)
                     end do
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = advxb, advxe
                         alpha_K(i - E_idx) = qK_prim_vf(j, k, l, i)
                     end do
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_vels
                         vel_K(i) = qK_prim_vf(j, k, l, contxe + i)
                     end do
 
                     vel_K_sum = 0._wp
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_vels
                         vel_K_sum = vel_K_sum + vel_K(i)**2._wp
                     end do
@@ -1523,7 +1523,7 @@ contains
                     ! Computing the energy from the pressure
 
                     if (chemistry) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = chemxb, chemxe
                             Y_K(i - chemxb + 1) = qK_prim_vf(j, k, l, i)
                         end do
@@ -1540,12 +1540,12 @@ contains
                     end if
 
                     ! mass flux, this should be \alpha_i \rho_i u_i
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, contxe
                         FK_vf(j, k, l, i) = alpha_rho_K(i)*vel_K(dir_idx(1))
                     end do
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_vels
                         FK_vf(j, k, l, contxe + dir_idx(i)) = &
                             rho_K*vel_K(dir_idx(1)) &
@@ -1558,14 +1558,14 @@ contains
 
                     ! Species advection Flux, \rho*u*Y
                     if (chemistry) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_species
                             FK_vf(j, k, l, i - 1 + chemxb) = vel_K(dir_idx(1))*(rho_K*Y_K(i))
                         end do
                     end if
 
                     if (riemann_solver == 1 .or. riemann_solver == 4) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = advxb, advxe
                             FK_vf(j, k, l, i) = 0._wp
                             FK_src_vf(j, k, l, i) = alpha_K(i - E_idx)
@@ -1573,12 +1573,12 @@ contains
 
                     else
                         ! Could be bubbles_euler!
-                        !$acc loop seq
+                        $:LOOP()
                         do i = advxb, advxe
                             FK_vf(j, k, l, i) = vel_K(dir_idx(1))*alpha_K(i - E_idx)
                         end do
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = advxb, advxe
                             FK_src_vf(j, k, l, i) = vel_K(dir_idx(1))
                         end do
@@ -1652,7 +1652,7 @@ contains
                 c = (1._wp/(rho*(adv(1)/blkmod1 + adv(2)/blkmod2)))
             elseif (model_eqns == 3) then
                 c = 0._wp
-                !$acc loop seq
+                $:LOOP()
                 do q = 1, num_fluids
                     c = c + adv(q)*(1._wp/gammas(q) + 1._wp)* &
                         (pres + pi_infs(q)/(gammas(q) + 1._wp))
diff --git a/src/simulation/include/inline_riemann.fpp b/src/simulation/include/inline_riemann.fpp
index 9ad956a2c8..81e311141a 100644
--- a/src/simulation/include/inline_riemann.fpp
+++ b/src/simulation/include/inline_riemann.fpp
@@ -1,7 +1,7 @@
 #:def arithmetic_avg()
     rho_avg = 5e-1_wp*(rho_L + rho_R)
     vel_avg_rms = 0._wp
-    !$acc loop seq
+    $:LOOP()
     do i = 1, num_vels
         vel_avg_rms = vel_avg_rms + (5e-1_wp*(vel_L(i) + vel_R(i)))**2._wp
     end do
@@ -17,7 +17,7 @@
 
     vel_avg_rms = 0._wp
 
-    !$acc loop seq
+    $:LOOP()
     do i = 1, num_vels
         vel_avg_rms = vel_avg_rms + (sqrt(rho_L)*vel_L(i) + sqrt(rho_R)*vel_R(i))**2._wp/ &
                       (sqrt(rho_L) + sqrt(rho_R))**2._wp
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 7c15bb5cf1..73d1f14fb8 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -232,7 +232,7 @@ contains
 
                 if (bubbles_euler) then
                     if (num_fluids > 2) then
-                        !$acc loop seq
+                        $:LOOP()
                         do q = 1, num_fluids - 1
                             myRho = myRho + myalpha_rho(q)
                             B_tait = B_tait + myalpha(q)*pi_infs(q)
@@ -246,7 +246,7 @@ contains
                 end if
 
                 if ((.not. bubbles_euler) .or. (mpp_lim .and. (num_fluids > 2))) then
-                    !$acc loop seq
+                    $:LOOP()
                     do q = 1, num_fluids
                         myRho = myRho + myalpha_rho(q)
                         B_tait = B_tait + myalpha(q)*pi_infs(q)
@@ -317,11 +317,11 @@ contains
         do l = 0, p
             do k = 0, n
                 do j = 0, m
-                    !$acc loop seq
+                    $:LOOP()
                     do q = contxb, contxe
                         rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mass_src(j, k, l)
                     end do
-                    !$acc loop seq
+                    $:LOOP()
                     do q = momxb, momxe
                         rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mom_src(q - contxe, j, k, l)
                     end do
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index dfe0f3da49..84ac15ecb3 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -82,7 +82,7 @@ contains
             do k = 0, n
                 do j = 0, m
                     nR3bar = 0._wp
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, nb
                         nR3bar = nR3bar + weight(i)*(q_cons_vf(rs(i))%sf(j, k, l))**3._wp
                     end do
@@ -180,7 +180,7 @@ contains
                 do j = 0, m
                     bub_adv_src(j, k, l) = 0._wp
 
-                    !$acc loop seq
+                    $:LOOP()
                     do q = 1, nb
                         bub_r_src(j, k, l, q) = 0._wp
                         bub_v_src(j, k, l, q) = 0._wp
@@ -202,7 +202,7 @@ contains
                     if (adv_n) then
                         nbub = q_prim_vf(n_idx)%sf(j, k, l)
                     else
-                        !$acc loop seq
+                        $:LOOP()
                         do q = 1, nb
                             Rtmp(q) = q_prim_vf(rs(q))%sf(j, k, l)
                             Vtmp(q) = q_prim_vf(vs(q))%sf(j, k, l)
@@ -210,7 +210,7 @@ contains
 
                         R3 = 0._wp
 
-                        !$acc loop seq
+                        $:LOOP()
                         do q = 1, nb
                             R3 = R3 + weight(q)*Rtmp(q)**3._wp
                         end do
@@ -221,7 +221,7 @@ contains
                     if (.not. adap_dt) then
                         R2Vav = 0._wp
 
-                        !$acc loop seq
+                        $:LOOP()
                         do q = 1, nb
                             R2Vav = R2Vav + weight(q)*Rtmp(q)**2._wp*Vtmp(q)
                         end do
@@ -229,10 +229,10 @@ contains
                         bub_adv_src(j, k, l) = 4._wp*pi*nbub*R2Vav
                     end if
 
-                    !$acc loop seq
+                    $:LOOP()
                     do q = 1, nb
 
-                        !$acc loop seq
+                        $:LOOP()
                         do ii = 1, num_fluids
                             myalpha_rho(ii) = q_cons_vf(ii)%sf(j, k, l)
                             myalpha(ii) = q_cons_vf(advxb + ii - 1)%sf(j, k, l)
@@ -243,14 +243,14 @@ contains
                         B_tait = 0._wp
 
                         if (mpp_lim .and. (num_fluids > 2)) then
-                            !$acc loop seq
+                            $:LOOP()
                             do ii = 1, num_fluids
                                 myRho = myRho + myalpha_rho(ii)
                                 n_tait = n_tait + myalpha(ii)*gammas(ii)
                                 B_tait = B_tait + myalpha(ii)*pi_infs(ii)
                             end do
                         else if (num_fluids > 2) then
-                            !$acc loop seq
+                            $:LOOP()
                             do ii = 1, num_fluids - 1
                                 myRho = myRho + myalpha_rho(ii)
                                 n_tait = n_tait + myalpha(ii)*gammas(ii)
@@ -333,7 +333,7 @@ contains
                         rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + bub_adv_src(i, q, l)
                         if (num_fluids > 1) rhs_vf(advxb)%sf(i, q, l) = &
                             rhs_vf(advxb)%sf(i, q, l) - bub_adv_src(i, q, l)
-                        !$acc loop seq
+                        $:LOOP()
                         do k = 1, nb
                             rhs_vf(rs(k))%sf(i, q, l) = rhs_vf(rs(k))%sf(i, q, l) + bub_r_src(i, q, l, k)
                             rhs_vf(vs(k))%sf(i, q, l) = rhs_vf(vs(k))%sf(i, q, l) + bub_v_src(i, q, l, k)
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index f83b64a3fd..4fa66a2757 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -570,7 +570,7 @@ contains
             call s_get_pinf(k, q_prim_vf, 1, myPinf, cell, aux1, aux2)
 
             ! Obtain liquid density and computing speed of sound from pinf
-            !$acc loop seq
+            $:LOOP()
             do i = 1, num_fluids
                 myalpha_rho(i) = q_prim_vf(i)%sf(cell(1), cell(2), cell(3))
                 myalpha(i) = q_prim_vf(E_idx + i)%sf(cell(1), cell(2), cell(3))
@@ -750,7 +750,7 @@ contains
         real(wp), dimension(num_dims) :: vel
         integer :: i
 
-        !$acc loop seq
+        $:LOOP()
         do i = 1, num_dims
             vel(i) = q_prim_vf(i + contxe)%sf(cell(1), cell(2), cell(3))
         end do
@@ -831,7 +831,7 @@ contains
 
         !< Find current bubble cell
         cell(:) = int(scoord(:))
-        !$acc loop seq
+        $:LOOP()
         do i = 1, num_dims
             if (scoord(i) < 0._wp) cell(i) = cell(i) - 1
         end do
@@ -922,11 +922,11 @@ contains
             charpres2 = 0._wp
             vol = 0._wp
 
-            !$acc loop seq
+            $:LOOP()
             do i = 1, smearGrid
-                !$acc loop seq
+                $:LOOP()
                 do j = 1, smearGrid
-                    !$acc loop seq
+                    $:LOOP()
                     do k = 1, smearGridz
                         cellaux(1) = cell(1) + i - (mapCells + 1)
                         cellaux(2) = cell(2) + j - (mapCells + 1)
@@ -1648,7 +1648,7 @@ contains
 
         integer :: i
 
-        !$acc loop seq
+        $:LOOP()
         do i = bub_id, nBubs - 1
             lag_id(i, 1) = lag_id(i + 1, 1)
             bub_R0(i) = bub_R0(i + 1)
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 73458d56f8..dd26870eb8 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -781,25 +781,25 @@ contains
                     do k = is2%beg, is2%end
 
                         ! Transferring the Primitive Variables
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, contxe
                             alpha_rho(i) = q_prim_rs${XYZ}$_vf(0, k, r, i)
                         end do
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_dims
                             vel(i) = q_prim_rs${XYZ}$_vf(0, k, r, contxe + i)
                         end do
 
                         vel_K_sum = 0._wp
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_dims
                             vel_K_sum = vel_K_sum + vel(i)**2._wp
                         end do
 
                         pres = q_prim_rs${XYZ}$_vf(0, k, r, E_idx)
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, advxe - E_idx
                             adv(i) = q_prim_rs${XYZ}$_vf(0, k, r, E_idx + i)
                         end do
@@ -810,13 +810,13 @@ contains
                             call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc, 0, k, r)
                         end if
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, contxe
                             mf(i) = alpha_rho(i)/rho
                         end do
 
                         if (chemistry) then
-                            !$acc loop seq
+                            $:LOOP()
                             do i = chemxb, chemxe
                                 Ys(i - chemxb + 1) = q_prim_rs${XYZ}$_vf(0, k, r, i)
                             end do
@@ -849,39 +849,39 @@ contains
 
                         ! First-Order Spatial Derivatives of Primitive Variables
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, contxe
                             dalpha_rho_ds(i) = 0._wp
                         end do
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_dims
                             dvel_ds(i) = 0._wp
                         end do
 
                         dpres_ds = 0._wp
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, advxe - E_idx
                             dadv_ds(i) = 0._wp
                         end do
 
                         if (chemistry) then
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_species
                                 dYs_ds(i) = 0._wp
                             end do
                         end if
 
-                        !$acc loop seq
+                        $:LOOP()
                         do j = 0, buff_size
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, contxe
                                 dalpha_rho_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, i)* &
                                                    fd_coef_${XYZ}$ (j, cbc_loc) + &
                                                    dalpha_rho_ds(i)
                             end do
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_dims
                                 dvel_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, contxe + i)* &
                                              fd_coef_${XYZ}$ (j, cbc_loc) + &
@@ -891,7 +891,7 @@ contains
                             dpres_ds = q_prim_rs${XYZ}$_vf(j, k, r, E_idx)* &
                                        fd_coef_${XYZ}$ (j, cbc_loc) + &
                                        dpres_ds
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, advxe - E_idx
                                 dadv_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, E_idx + i)* &
                                              fd_coef_${XYZ}$ (j, cbc_loc) + &
@@ -899,7 +899,7 @@ contains
                             end do
 
                             if (chemistry) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_species
                                     dYs_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, chemxb - 1 + i)* &
                                                 fd_coef_${XYZ}$ (j, cbc_loc) + &
@@ -926,7 +926,7 @@ contains
                             call s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
                             ! Add GRCBC for Subsonic Inflow
                             if (bc_${XYZ}$%grcbc_in) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 2, momxb
                                     L(2) = c**3._wp*Ma*(alpha_rho(i - 1) - alpha_rho_in(i - 1, ${CBC_DIR}$))/Del_in(${CBC_DIR}$) - c*Ma*(pres - pres_in(${CBC_DIR}$))/Del_in(${CBC_DIR}$)
                                 end do
@@ -936,7 +936,7 @@ contains
                                         L(momxb + 2) = c*Ma*(vel(dir_idx(3)) - vel_in(${CBC_DIR}$, dir_idx(3)))/Del_in(${CBC_DIR}$)
                                     end if
                                 end if
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = E_idx, advxe - 1
                                     L(i) = c*Ma*(adv(i + 1 - E_idx) - alpha_in(i + 1 - E_idx, ${CBC_DIR}$))/Del_in(${CBC_DIR}$)
                                 end do
@@ -976,13 +976,13 @@ contains
                             dpres_dt = -5e-1_wp*(L(advxe) + L(1))
                         end if
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, contxe
                             dalpha_rho_dt(i) = &
                                 -(L(i + 1) - mf(i)*dpres_dt)/(c*c)
                         end do
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_dims
                             dvel_dt(dir_idx(i)) = dir_flg(dir_idx(i))* &
                                                   (L(1) - L(advxe))/(2._wp*rho*c) + &
@@ -991,13 +991,13 @@ contains
                         end do
 
                         vel_dv_dt_sum = 0._wp
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_dims
                             vel_dv_dt_sum = vel_dv_dt_sum + vel(i)*dvel_dt(i)
                         end do
 
                         if (chemistry) then
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_species
                                 dYs_dt(i) = -1._wp*L(chemxb + i - 1)
                             end do
@@ -1005,12 +1005,12 @@ contains
 
                         ! The treatment of void fraction source is unclear
                         if (cyl_coord .and. cbc_dir == 2 .and. cbc_loc == 1) then
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, advxe - E_idx
                                 dadv_dt(i) = -L(momxe + i) !+ adv(i) * vel(dir_idx(1))/y_cc(n)
                             end do
                         else
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, advxe - E_idx
                                 dadv_dt(i) = -L(momxe + i)
                             end do
@@ -1023,7 +1023,7 @@ contains
                             dgamma_dt = dadv_dt(1)
                             dpi_inf_dt = dadv_dt(2)
                         else
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 drho_dt = drho_dt + dalpha_rho_dt(i)
                                 dgamma_dt = dgamma_dt + dadv_dt(i)*gammas(i)
@@ -1033,13 +1033,13 @@ contains
                         end if
 
                         ! flux_rs_vf_l and flux_src_rs_vf_l at j = -1/2
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, contxe
                             flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) &
                                                                + ds(0)*dalpha_rho_dt(i)
                         end do
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = momxb, momxe
                             flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) &
                                                                + ds(0)*(vel(i - contxe)*drho_dt &
@@ -1050,14 +1050,14 @@ contains
                             ! Evolution of LODI equation of energy for real gases adjusted to perfect gas, doi:10.1006/jcph.2002.6990
                             call get_species_enthalpies_rt(T, h_k)
                             sum_Enthalpies = 0._wp
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_species
                                 h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T
                                 sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights(i)*Cp/R_gas)*dYs_dt(i)
                             end do
                             flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) &
                                                                    + ds(0)*((E/rho + pres/rho)*drho_dt + rho*vel_dv_dt_sum + Cp*T*L(2)/(c*c) + sum_Enthalpies)
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_species
                                 flux_rs${XYZ}$_vf_l(-1, k, r, i - 1 + chemxb) = flux_rs${XYZ}$_vf_l(0, k, r, chemxb + i - 1) &
                                                                                 + ds(0)*(drho_dt*Ys(i) + rho*dYs_dt(i))
@@ -1073,12 +1073,12 @@ contains
                         end if
 
                         if (riemann_solver == 1) then
-                            !$acc loop seq
+                            $:LOOP()
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf_l(-1, k, r, i) = 0._wp
                             end do
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = advxb, advxe
                                 flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = &
                                     1._wp/max(abs(vel(dir_idx(1))), sgm_eps) &
@@ -1091,13 +1091,13 @@ contains
 
                         else
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) + &
                                                                    ds(0)*dadv_dt(i - E_idx)
                             end do
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = advxb, advxe
                                 flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = flux_src_rs${XYZ}$_vf_l(0, k, r, i)
                             end do
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 487e0ceb64..43581328b7 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -56,7 +56,7 @@ contains
         @:ACC_SETUP_VFs(btensor)
 
         @:ALLOCATE(Gs(1:num_fluids))
-        !$acc loop seq
+        $:LOOP()
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
@@ -112,7 +112,7 @@ contains
         do l = 0, p
             do k = 0, n
                 do j = 0, m
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_fluids
                         alpha_rho_k(i) = q_cons_vf(i)%sf(j, k, l)
                         alpha_k(i) = q_cons_vf(advxb + i - 1)%sf(j, k, l)
@@ -125,7 +125,7 @@ contains
                     !if ( G <= verysmall ) G_K = 0_wp
 
                     if (G > verysmall) then
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, tensor_size
                             tensora(i) = 0_wp
                         end do
@@ -134,7 +134,7 @@ contains
                         ! number for the tensor 1-3:  dxix_dx, dxiy_dx, dxiz_dx
                         ! 4-6 :                       dxix_dy, dxiy_dy, dxiz_dy
                         ! 7-9 :                       dxix_dz, dxiy_dz, dxiz_dz
-                        !$acc loop seq
+                        $:LOOP()
                         do r = -fd_number, fd_number
                             ! derivatives in the x-direction
                             tensora(1) = tensora(1) + q_prim_vf(xibeg)%sf(j + r, k, l)*fd_coeff_x(r, j)
@@ -168,7 +168,7 @@ contains
                         if (tensorb(tensor_size) > verysmall) then
                             ! STEP 2c: computing the inverse of grad_xi tensor = F
                             ! tensorb is the adjoint, tensora becomes F
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, tensor_size - 1
                                 tensora(i) = tensorb(i)/tensorb(tensor_size)
                             end do
@@ -199,7 +199,7 @@ contains
                             q_prim_vf(E_idx)%sf(j, k, l) = q_prim_vf(E_idx)%sf(j, k, l) - &
                                                            G*q_prim_vf(xiend + 1)%sf(j, k, l)/gamma
                             ! STEP 5c: updating the Cauchy stress conservative scalar field
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, b_size - 1
                                 q_cons_vf(strxb + i - 1)%sf(j, k, l) = &
                                     rho*q_prim_vf(strxb + i - 1)%sf(j, k, l)
@@ -240,7 +240,7 @@ contains
         #:endfor
         ! dividing by the jacobian for neo-Hookean model
         ! setting the tensor to the stresses for riemann solver
-        !$acc loop seq
+        $:LOOP()
         do i = 1, b_size - 1
             q_prim_vf(strxb + i - 1)%sf(j, k, l) = &
                 G*btensor(i)%sf(j, k, l)/btensor(b_size)%sf(j, k, l)
@@ -281,7 +281,7 @@ contains
 
         ! dividing by the jacobian for neo-Hookean model
         ! setting the tensor to the stresses for riemann solver
-        !$acc loop seq
+        $:LOOP()
         do i = 1, b_size - 1
             q_prim_vf(strxb + i - 1)%sf(j, k, l) = &
                 G*btensor(i)%sf(j, k, l)/btensor(b_size)%sf(j, k, l)
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index 68032cb5b8..4c1a101e49 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -119,7 +119,7 @@ contains
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
-                        !$acc loop seq
+                        $:LOOP()
                         do r = -fd_number, fd_number
                             du_dx(k, l, q) = du_dx(k, l, q) &
                                              + q_prim_vf(momxb)%sf(k + r, l, q)*fd_coeff_x_h(r, k)
@@ -145,7 +145,7 @@ contains
                 do q = 0, p
                     do l = 0, n
                         do k = 0, m
-                            !$acc loop seq
+                            $:LOOP()
                             do r = -fd_number, fd_number
                                 du_dy(k, l, q) = du_dy(k, l, q) &
                                                  + q_prim_vf(momxb)%sf(k, l + r, q)*fd_coeff_y_h(r, l)
@@ -177,7 +177,7 @@ contains
                     do q = 0, p
                         do l = 0, n
                             do k = 0, m
-                                !$acc loop seq
+                                $:LOOP()
                                 do r = -fd_number, fd_number
                                     du_dz(k, l, q) = du_dz(k, l, q) &
                                                      + q_prim_vf(momxb)%sf(k, l, q + r)*fd_coeff_z_h(r, q)
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 31fd99fe99..d118a2fa55 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -190,7 +190,7 @@ contains
             dyn_pres = 0._wp
 
             ! Set q_prim_vf params at GP so that mixture vars calculated properly
-            !$acc loop seq
+            $:LOOP()
             do q = 1, num_fluids
                 q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q)
                 q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q)
@@ -226,7 +226,7 @@ contains
             end if
 
             ! Set momentum
-            !$acc loop seq
+            $:LOOP()
             do q = momxb, momxe
                 q_cons_vf(q)%sf(j, k, l) = rho*vel_g(q - momxb + 1)
                 dyn_pres = dyn_pres + q_cons_vf(q)%sf(j, k, l)* &
@@ -234,7 +234,7 @@ contains
             end do
 
             ! Set continuity and adv vars
-            !$acc loop seq
+            $:LOOP()
             do q = 1, num_fluids
                 q_cons_vf(q)%sf(j, k, l) = alpha_rho_IP(q)
                 q_cons_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q)
@@ -288,7 +288,7 @@ contains
             end if
 
             if (model_eqns == 3) then
-                !$acc loop seq
+                $:LOOP()
                 do q = intxb, intxe
                     q_cons_vf(q)%sf(j, k, l) = alpha_IP(q - intxb + 1)*(gammas(q - intxb + 1)*pres_IP &
                                                                         + pi_infs(q - intxb + 1))
@@ -314,7 +314,7 @@ contains
                 physical_loc = [x_cc(j), y_cc(k), 0._wp]
             end if
 
-            !$acc loop seq
+            $:LOOP()
             do q = 1, num_fluids
                 q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q)
                 q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q)
@@ -329,7 +329,7 @@ contains
 
             dyn_pres = 0._wp
 
-            !$acc loop seq
+            $:LOOP()
             do q = momxb, momxe
                 q_cons_vf(q)%sf(j, k, l) = rho*vel_g(q - momxb + 1)
                 dyn_pres = dyn_pres + q_cons_vf(q)%sf(j, k, l)* &
@@ -798,11 +798,11 @@ contains
             end if
         end if
 
-        !$acc loop seq
+        $:LOOP()
         do i = i1, i2
-            !$acc loop seq
+            $:LOOP()
             do j = j1, j2
-                !$acc loop seq
+                $:LOOP()
                 do k = k1, k2
 
                     coeff = gp%interp_coeffs(i - i1 + 1, j - j1 + 1, k - k1 + 1)
@@ -810,13 +810,13 @@ contains
                     pres_IP = pres_IP + coeff* &
                               q_prim_vf(E_idx)%sf(i, j, k)
 
-                    !$acc loop seq
+                    $:LOOP()
                     do q = momxb, momxe
                         vel_IP(q + 1 - momxb) = vel_IP(q + 1 - momxb) + coeff* &
                                                 q_prim_vf(q)%sf(i, j, k)
                     end do
 
-                    !$acc loop seq
+                    $:LOOP()
                     do l = contxb, contxe
                         alpha_rho_IP(l) = alpha_rho_IP(l) + coeff* &
                                           q_prim_vf(l)%sf(i, j, k)
@@ -829,7 +829,7 @@ contains
                     end if
 
                     if (bubbles_euler .and. .not. qbmm) then
-                        !$acc loop seq
+                        $:LOOP()
                         do l = 1, nb
                             if (polytropic) then
                                 r_IP(l) = r_IP(l) + coeff*q_prim_vf(bubxb + (l - 1)*2)%sf(i, j, k)
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index 8e6ea6c820..a096a4fe04 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -83,16 +83,16 @@ contains
                 do k = 0, m
 
                     divB = 0._wp
-                    !$acc loop seq
+                    $:LOOP()
                     do r = -fd_number, fd_number
                         divB = divB + q_prim_vf(B_idx%beg)%sf(k + r, l, q)*fd_coeff_x_h(r, k)
                     end do
-                    !$acc loop seq
+                    $:LOOP()
                     do r = -fd_number, fd_number
                         divB = divB + q_prim_vf(B_idx%beg + 1)%sf(k, l + r, q)*fd_coeff_y_h(r, l)
                     end do
                     if (p > 0) then
-                        !$acc loop seq
+                        $:LOOP()
                         do r = -fd_number, fd_number
                             divB = divB + q_prim_vf(B_idx%beg + 2)%sf(k, l, q + r)*fd_coeff_z_h(r, q)
                         end do
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index f9ddd1f2d1..075334a2d0 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -488,7 +488,7 @@ contains
 
                         rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + mom_sp(2)%sf(i, q, l)
                         j = bubxb
-                        !$acc loop seq
+                        $:LOOP()
                         do k = 1, nb
                             rhs_vf(j)%sf(i, q, l) = &
                                 rhs_vf(j)%sf(i, q, l) + mom_3d(0, 0, k)%sf(i, q, l)
@@ -876,10 +876,10 @@ contains
 
                         nbub = q_cons_vf(bubxb)%sf(id1, id2, id3)
 
-                        !$acc loop seq
+                        $:LOOP()
                         do q = 1, nb
                             !Initialize moment set for each R0 bin
-                            !$acc loop seq
+                            $:LOOP()
                             do r = 2, nmom
                                 moms(r) = q_prim_vf(bubmoms(q, r))%sf(id1, id2, id3)
                             end do
@@ -890,13 +890,13 @@ contains
 
                             if (polytropic) then
                                 !Account for bubble pressure pb0 at each R0 bin
-                                !$acc loop seq
+                                $:LOOP()
                                 do j = 1, nnode
                                     wght_pb(j, q) = wght(j, q)*(pb0(q) - pv)
                                 end do
                             else
                                 !Account for bubble pressure, mass transfer rate and heat transfer rate in wght_pb, wght_mv and wght_ht using Preston model
-                                !$acc loop seq
+                                $:LOOP()
                                 do j = 1, nnode
                                     chi_vw = 1._wp/(1._wp + R_v/R_n*(pb(id1, id2, id3, j, q)/pv - 1._wp))
                                     x_vw = M_n*chi_vw/(M_v + (M_n - M_v)*chi_vw)
@@ -919,13 +919,13 @@ contains
 
                             !Compute change in moments due to bubble dynamics
                             r = 1
-                            !$acc loop seq
+                            $:LOOP()
                             do i2 = 0, 2
-                                !$acc loop seq
+                                $:LOOP()
                                 do i1 = 0, 2
                                     if ((i1 + i2) <= 2) then
                                         momsum = 0._wp
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do j = 1, nterms
                                             ! Account for term with pb in Rayleigh Plesset equation
                                             if (bubble_model == 3 .and. j == 3) then
@@ -960,7 +960,7 @@ contains
 
                             ! Compute change in pb and mv for non-polytroic model
                             if (.not. polytropic) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do j = 1, nnode
                                     ! Compute Rdot (drdt) at quadrature node in the ODE for pb (note this is not the same as bubble variable Rdot)
                                     drdt = msum(2)
@@ -1010,11 +1010,11 @@ contains
                         end if
 
                     else
-                        !$acc loop seq
+                        $:LOOP()
                         do q = 1, nb
-                            !$acc loop seq
+                            $:LOOP()
                             do i1 = 0, 2
-                                !$acc loop seq
+                                $:LOOP()
                                 do i2 = 0, 2
                                     moms3d(i1, i2, q)%sf(id1, id2, id3) = 0._wp
                                 end do
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 730ccba98b..9f96010e56 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -652,11 +652,11 @@ contains
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         alf_sum%sf(j, k, l) = 0._wp
-                        !$acc loop seq
+                        $:LOOP()
                         do i = advxb, advxe - 1
                             alf_sum%sf(j, k, l) = alf_sum%sf(j, k, l) + q_cons_qp%vf(i)%sf(j, k, l)
                         end do
-                        !$acc loop seq
+                        $:LOOP()
                         do i = advxb, advxe - 1
                             q_cons_qp%vf(i)%sf(j, k, l) = q_cons_qp%vf(i)%sf(j, k, l)*(1._wp - q_cons_qp%vf(alf_idx)%sf(j, k, l)) &
                                                           /alf_sum%sf(j, k, l)
@@ -1462,7 +1462,7 @@ contains
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
-                        !$acc loop seq
+                        $:LOOP()
                         do i = momxb, E_idx
                             rhs_vf(i)%sf(j, k, l) = &
                                 rhs_vf(i)%sf(j, k, l) + 1._wp/dx(j)* &
@@ -1511,7 +1511,7 @@ contains
                     $:PARALLEL_LOOP(collapse=2)
                     do l = 0, p
                         do j = 0, m
-                            !$acc loop seq
+                            $:LOOP()
                             do i = momxb, E_idx
                                 rhs_vf(i)%sf(j, 0, l) = &
                                     rhs_vf(i)%sf(j, 0, l) + 1._wp/(y_cc(1) - y_cc(-1))* &
@@ -1527,7 +1527,7 @@ contains
                 do l = 0, p
                     do k = 1, n
                         do j = 0, m
-                            !$acc loop seq
+                            $:LOOP()
                             do i = momxb, E_idx
                                 rhs_vf(i)%sf(j, k, l) = &
                                     rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* &
@@ -1543,7 +1543,7 @@ contains
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
-                            !$acc loop seq
+                            $:LOOP()
                             do i = momxb, E_idx
                                 rhs_vf(i)%sf(j, k, l) = &
                                     rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* &
@@ -1564,7 +1564,7 @@ contains
                     do l = 0, p
                         do k = 1, n
                             do j = 0, m
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = momxb, E_idx
                                     rhs_vf(i)%sf(j, k, l) = &
                                         rhs_vf(i)%sf(j, k, l) - 5e-1_wp/y_cc(k)* &
@@ -1579,7 +1579,7 @@ contains
                         $:PARALLEL_LOOP(collapse=2)
                         do l = 0, p
                             do j = 0, m
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = momxb, E_idx
                                     rhs_vf(i)%sf(j, 0, l) = &
                                         rhs_vf(i)%sf(j, 0, l) - 1._wp/y_cc(0)* &
@@ -1594,7 +1594,7 @@ contains
                     do l = 0, p
                         do k = 0, n
                             do j = 0, m
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = momxb, E_idx
                                     rhs_vf(i)%sf(j, k, l) = &
                                         rhs_vf(i)%sf(j, k, l) - 5e-1_wp/y_cc(k)* &
@@ -1629,7 +1629,7 @@ contains
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
-                        !$acc loop seq
+                        $:LOOP()
                         do i = momxb, E_idx
                             rhs_vf(i)%sf(j, k, l) = &
                                 rhs_vf(i)%sf(j, k, l) + 1._wp/dz(l)* &
@@ -1706,7 +1706,7 @@ contains
                     if (mpp_lim) then
                         sum_alpha = 0._wp
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             if ((q_cons_vf(i + contxb - 1)%sf(j, k, l) < 0._wp) .or. &
                                 (q_cons_vf(i + advxb - 1)%sf(j, k, l) < 0._wp)) then
@@ -1720,7 +1720,7 @@ contains
                             sum_alpha = sum_alpha + q_cons_vf(i + advxb - 1)%sf(j, k, l)
                         end do
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + advxb - 1)%sf(j, k, l)/sum_alpha
                         end do
@@ -1731,7 +1731,7 @@ contains
                     ! Is the pressure relaxation procedure necessary?
                     relax = 1
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_fluids
                         if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > (1._wp - sgm_eps)) relax = 0
                     end do
@@ -1740,7 +1740,7 @@ contains
                         ! Initial state
                         pres_relax = 0._wp
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
                                 pres_K_init(i) = &
@@ -1760,12 +1760,12 @@ contains
                         f_pres = 1e-9_wp
                         df_pres = 1e9_wp
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             rho_K_s(i) = 0._wp
                         end do
 
-                        !$acc loop seq
+                        $:LOOP()
                         do iter = 0, 49
 
                             if (abs(f_pres) > 1e-10_wp) then
@@ -1781,7 +1781,7 @@ contains
                                 f_pres = -1._wp
                                 df_pres = 0._wp
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
                                         rho_K_s(i) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/ &
@@ -1801,7 +1801,7 @@ contains
                         end do
 
                         ! Cell update of the volume fraction
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) &
                                 q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l) &
@@ -1817,7 +1817,7 @@ contains
                     ! However, the internal-energy equations should be reset with the corresponding mixture
                     ! pressure from the correction. This step is carried out below.
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_fluids
                         alpha_rho(i) = q_cons_vf(i)%sf(j, k, l)
                         alpha(i) = q_cons_vf(E_idx + i)%sf(j, k, l)
@@ -1829,14 +1829,14 @@ contains
                         pi_inf = 0._wp
 
                         if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 rho = rho + alpha_rho(i)
                                 gamma = gamma + alpha(i)*gammas(i)
                                 pi_inf = pi_inf + alpha(i)*pi_infs(i)
                             end do
                         else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids - 1
                                 rho = rho + alpha_rho(i)
                                 gamma = gamma + alpha(i)*gammas(i)
@@ -1855,7 +1855,7 @@ contains
                         sum_alpha = 0._wp
 
                         if (mpp_lim) then
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 alpha_rho(i) = max(0._wp, alpha_rho(i))
                                 alpha(i) = min(max(0._wp, alpha(i)), 1._wp)
@@ -1866,7 +1866,7 @@ contains
 
                         end if
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             rho = rho + alpha_rho(i)
                             gamma = gamma + alpha(i)*gammas(i)
@@ -1874,12 +1874,12 @@ contains
                         end do
 
                         if (viscous) then
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, 2
                                 Re(i) = dflt_real
 
                                 if (Re_size(i) > 0) Re(i) = 0._wp
-                                !$acc loop seq
+                                $:LOOP()
                                 do q = 1, Re_size(i)
                                     Re(i) = alpha(Re_idx(i, q))/Res(i, q) &
                                             + Re(i)
@@ -1893,7 +1893,7 @@ contains
 
                     dyn_pres = 0._wp
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = momxb, momxe
                         dyn_pres = dyn_pres + 5e-1_wp*q_cons_vf(i)%sf(j, k, l)* &
                                    q_cons_vf(i)%sf(j, k, l)/max(rho, sgm_eps)
@@ -1901,7 +1901,7 @@ contains
 
                     pres_relax = (q_cons_vf(E_idx)%sf(j, k, l) - dyn_pres - pi_inf)/gamma
 
-                    !$acc loop seq
+                    $:LOOP()
                     do i = 1, num_fluids
                         q_cons_vf(i + intxb - 1)%sf(j, k, l) = &
                             q_cons_vf(i + advxb - 1)%sf(j, k, l)* &
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index c496ce7632..a6d1c80a77 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -374,13 +374,13 @@ contains
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = is1%beg, is1%end
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, contxe
                                 alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                 alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                             end do
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_vels
                                 vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                 vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -388,13 +388,13 @@ contains
 
                             vel_L_rms = 0._wp; vel_R_rms = 0._wp
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_vels
                                 vel_L_rms = vel_L_rms + vel_L(i)**2._wp
                                 vel_R_rms = vel_R_rms + vel_R(i)**2._wp
                             end do
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                 alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
@@ -438,7 +438,7 @@ contains
                             pres_mag%R = 0._wp
 
                             if (mpp_lim) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_L(i) = max(0._wp, alpha_rho_L(i))
                                     alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp)
@@ -447,7 +447,7 @@ contains
 
                                 alpha_L = alpha_L/max(alpha_L_sum, sgm_eps)
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_R(i) = max(0._wp, alpha_rho_R(i))
                                     alpha_R(i) = min(max(0._wp, alpha_R(i)), 1._wp)
@@ -457,7 +457,7 @@ contains
                                 alpha_R = alpha_R/max(alpha_R_sum, sgm_eps)
                             end if
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 rho_L = rho_L + alpha_rho_L(i)
                                 gamma_L = gamma_L + alpha_L(i)*gammas(i)
@@ -471,13 +471,13 @@ contains
                             end do
 
                             if (viscous) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, 2
                                     Re_L(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do q = 1, Re_size(i)
                                         Re_L(i) = alpha_L(Re_idx(i, q))/Res(i, q) &
                                                   + Re_L(i)
@@ -487,13 +487,13 @@ contains
 
                                 end do
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, 2
                                     Re_R(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do q = 1, Re_size(i)
                                         Re_R(i) = alpha_R(Re_idx(i, q))/Res(i, q) &
                                                   + Re_R(i)
@@ -504,7 +504,7 @@ contains
                             end if
 
                             if (chemistry) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = chemxb, chemxe
                                     Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -601,7 +601,7 @@ contains
                             if (hypoelasticity) then
                                 G_L = 0._wp; G_R = 0._wp
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     G_L = G_L + alpha_L(i)*Gs(i)
                                     G_R = G_R + alpha_R(i)*Gs(i)
@@ -634,7 +634,7 @@ contains
                             !    G_L = 0._wp
                             !    G_R = 0._wp
                             !
-                            !    !$acc loop seq
+                            !    $:LOOP()
                             !    do i = 1, num_fluids
                             !        G_L = G_L + alpha_L(i)*Gs(i)
                             !        G_R = G_R + alpha_R(i)*Gs(i)
@@ -643,17 +643,17 @@ contains
                             !    if ((G_L > 1e-3_wp) .and. (G_R > 1e-3_wp)) then
                             !    E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
                             !    E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
-                            !    !$acc loop seq
+                            !    $:LOOP()
                             !    do i = 1, b_size-1
                             !        tau_e_L(i) = G_L*qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                             !        tau_e_R(i) = G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
                             !    end do
-                            !    !$acc loop seq
+                            !    $:LOOP()
                             !    do i = 1, b_size-1
                             !        tau_e_L(i) = 0_wp
                             !        tau_e_R(i) = 0_wp
                             !    end do
-                            !    !$acc loop seq
+                            !    $:LOOP()
                             !    do i = 1, num_dims
                             !        xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
                             !        xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
@@ -681,7 +681,7 @@ contains
                             end if
 
                             if (viscous) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, 2
                                     Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                 end do
@@ -760,7 +760,7 @@ contains
 
                             ! Mass
                             if (.not. relativity) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         (s_M*alpha_rho_R(i)*vel_R(norm_dir) &
@@ -770,7 +770,7 @@ contains
                                         /(s_M - s_P)
                                 end do
                             elseif (relativity) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) &
@@ -851,7 +851,7 @@ contains
                                      + s_M*s_P*(cm%L(3) - cm%R(3))) &
                                     /(s_M - s_P)
                             elseif (bubbles_euler) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_vels
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         (s_M*(rho_R*vel_R(dir_idx(1)) &
@@ -866,7 +866,7 @@ contains
                                         + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
                                 end do
                             else if (hypoelasticity) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_vels
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         (s_M*(rho_R*vel_R(dir_idx(1)) &
@@ -882,7 +882,7 @@ contains
                                         /(s_M - s_P)
                                 end do
                             else
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_vels
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         (s_M*(rho_R*vel_R(dir_idx(1)) &
@@ -978,7 +978,7 @@ contains
                             end if
 
                             ! Advection
-                            !$acc loop seq
+                            $:LOOP()
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf(j, k, l, i) = &
                                     (qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -1003,7 +1003,7 @@ contains
                             !end if
 
                             ! Div(U)?
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_vels
                                 vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
                                     (xi_M*(rho_L*vel_L(dir_idx(i))* &
@@ -1024,7 +1024,7 @@ contains
                             end if
 
                             if (chemistry) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = chemxb, chemxe
                                     Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -1074,7 +1074,7 @@ contains
                             #:if (NORM_DIR == 2)
                                 if (cyl_coord) then
                                     !Substituting the advective flux into the inviscid geometrical source flux
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, E_idx
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                     end do
@@ -1083,7 +1083,7 @@ contains
                                         flux_rs${XYZ}$_vf(j, k, l, contxe + 2) &
                                         - (s_M*pres_R - s_P*pres_L)/(s_M - s_P)
                                     ! Geometrical source of the void fraction(s) is zero
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = advxb, advxe
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                     end do
@@ -1096,7 +1096,7 @@ contains
                                         (s_M*tau_e_R(4) - s_P*tau_e_L(4)) &
                                         /(s_M - s_P)
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = strxb, strxe
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                     end do
@@ -1312,7 +1312,7 @@ contains
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -1337,32 +1337,32 @@ contains
                                 alpha_R_sum = 0._wp
 
                                 if (mpp_lim) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
                                         alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     end do
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
                                     end do
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
                                         alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
                                     end do
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
                                     end do
                                 end if
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -1379,13 +1379,13 @@ contains
                                 end do
 
                                 if (viscous) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, 2
                                         Re_L(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do q = 1, Re_size(i)
                                             Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_L(i)
@@ -1395,13 +1395,13 @@ contains
 
                                     end do
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, 2
                                         Re_R(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do q = 1, Re_size(i)
                                             Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_R(i)
@@ -1417,18 +1417,18 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
                                 if (hypoelasticity) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, strxe - strxb + 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
                                     end do
                                     G_L = 0_wp; G_R = 0_wp
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         G_L = G_L + alpha_L(i)*Gs(i)
                                         G_R = G_R + alpha_R(i)*Gs(i)
                                     end do
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, strxe - strxb + 1
                                         ! Elastic contribution to energy if G large enough
                                         if ((G_L > verysmall) .and. (G_R > verysmall)) then
@@ -1445,13 +1445,13 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
                                 if (hyperelasticity) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_dims
                                         xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
                                         xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
                                     end do
                                     G_L = 0_wp; G_R = 0_wp; 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         ! Mixture left and right shear modulus
                                         G_L = G_L + alpha_L(i)*Gs(i)
@@ -1462,7 +1462,7 @@ contains
                                         E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
                                         E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
                                     end if
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, b_size - 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
@@ -1486,7 +1486,7 @@ contains
                                                               vel_avg_rms, 0._wp, c_avg)
 
                                 if (viscous) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, 2
                                         Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                     end do
@@ -1581,7 +1581,7 @@ contains
 
                                 ! COMPUTING FLUXES
                                 ! MASS FLUX.
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*(vel_L(idx1) + s_M*(xi_L - 1._wp)) + &
@@ -1590,7 +1590,7 @@ contains
 
                                 ! MOMENTUM FLUX.
                                 ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = rho_Star*vel_K_Star* &
@@ -1606,7 +1606,7 @@ contains
                                 ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
                                 if (elasticity) then
                                     flux_ene_e = 0_wp; 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_dims
                                         idxi = dir_idx(i)
                                         ! MOMENTUM ELASTIC FLUX.
@@ -1624,7 +1624,7 @@ contains
                                 end if
 
                                 ! VOLUME FRACTION FLUX.
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = advxb, advxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*s_S + &
@@ -1632,7 +1632,7 @@ contains
                                 end do
 
                                 ! SOURCE TERM FOR VOLUME FRACTION ADVECTION FLUX.
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     vel_src_rs${XYZ}$_vf(j, k, l, idxi) = &
@@ -1642,7 +1642,7 @@ contains
 
                                 ! INTERNAL ENERGIES ADVECTION FLUX.
                                 ! K-th pressure and velocity in preparation for the internal energy flux
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     p_K_Star = xi_M*(xi_MP*((pres_L + pi_infs(i)/(1_wp + gammas(i)))* &
                                                             xi_L**(1_wp/gammas(i) + 1_wp) - pi_infs(i)/(1_wp + gammas(i)) - pres_L) + pres_L) + &
@@ -1661,7 +1661,7 @@ contains
 
                                 ! HYPOELASTIC STRESS EVOLUTION FLUX.
                                 if (hypoelasticity) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, strxe - strxb + 1
                                         flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + &
@@ -1671,7 +1671,7 @@ contains
 
                                 ! REFERENCE MAP FLUX.
                                 if (hyperelasticity) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_dims
                                         flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
@@ -1692,11 +1692,11 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         !Substituting the advective flux into the inviscid geometrical source flux
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = intxb, intxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -1704,7 +1704,7 @@ contains
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = &
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star
                                         ! Geometrical source of the void fraction(s) is zero
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0_wp
                                         end do
@@ -1712,7 +1712,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0_wp
                                         end do
@@ -1737,26 +1737,26 @@ contains
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, contxe
                                     alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                                 end do
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
                                 end do
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     vel_L_rms = vel_L_rms + vel_L(i)**2._wp
                                     vel_R_rms = vel_R_rms + vel_R(i)**2._wp
                                 end do
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
@@ -1769,7 +1769,7 @@ contains
                                 gamma_L = 0._wp
                                 pi_inf_L = 0._wp
                                 qv_L = 0._wp
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     rho_L = rho_L + alpha_rho_L(i)
                                     gamma_L = gamma_L + alpha_L(i)*gammas(i)
@@ -1781,7 +1781,7 @@ contains
                                 gamma_R = 0._wp
                                 pi_inf_R = 0._wp
                                 qv_R = 0._wp
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     rho_R = rho_R + alpha_rho_R(i)
                                     gamma_R = gamma_R + alpha_R(i)*gammas(i)
@@ -1856,7 +1856,7 @@ contains
                                 xi_M = (5e-1_wp + sign(5e-1_wp, s_S))
                                 xi_P = (5e-1_wp - sign(5e-1_wp, s_S))
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*alpha_rho_L(i) &
@@ -1867,7 +1867,7 @@ contains
 
                                 ! Momentum flux.
                                 ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         xi_M*(rho_L*(vel_L(dir_idx(1))* &
@@ -1886,7 +1886,7 @@ contains
 
                                 if (bubbles_euler) then
                                     ! Put p_tilde in
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_dims
                                         flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                             flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) + &
@@ -1897,7 +1897,7 @@ contains
 
                                 flux_rs${XYZ}$_vf(j, k, l, E_idx) = 0._wp
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = alf_idx, alf_idx !only advect the void fraction
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -1907,7 +1907,7 @@ contains
                                 end do
 
                                 ! Source for volume fraction advection equation
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
 
                                     vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = 0._wp
@@ -1918,7 +1918,7 @@ contains
 
                                 ! Add advection flux for bubble variables
                                 if (bubbles_euler) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = bubxb, bubxe
                                         flux_rs${XYZ}$_vf(j, k, l, i) = &
                                             xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -1933,7 +1933,7 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         ! Substituting the advective flux into the inviscid geometrical source flux
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -1950,7 +1950,7 @@ contains
                                                                       (1._wp - dir_flg(dir_idx(1)))* &
                                                                       vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
                                         ! Geometrical source of the void fraction(s) is zero
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -1958,7 +1958,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -1992,7 +1992,7 @@ contains
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
@@ -2000,7 +2000,7 @@ contains
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -2018,7 +2018,7 @@ contains
 
                                 ! Retain this in the refactor
                                 if (mpp_lim .and. (num_fluids > 2)) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -2026,7 +2026,7 @@ contains
                                         qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
                                     end do
                                 else if (num_fluids > 2) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids - 1
                                         rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -2046,7 +2046,7 @@ contains
                                 qv_R = 0._wp
 
                                 if (mpp_lim .and. (num_fluids > 2)) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                                         gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
@@ -2054,7 +2054,7 @@ contains
                                         qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
                                     end do
                                 else if (num_fluids > 2) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids - 1
                                         rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                                         gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
@@ -2070,13 +2070,13 @@ contains
 
                                 if (viscous) then
                                     if (num_fluids == 1) then ! Need to consider case with num_fluids >= 2
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, 2
                                             Re_L(i) = dflt_real
 
                                             if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                            !$acc loop seq
+                                            $:LOOP()
                                             do q = 1, Re_size(i)
                                                 Re_L(i) = (1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q)))/Res(i, q) &
                                                           + Re_L(i)
@@ -2086,13 +2086,13 @@ contains
 
                                         end do
 
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, 2
                                             Re_R(i) = dflt_real
 
                                             if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                            !$acc loop seq
+                                            $:LOOP()
                                             do q = 1, Re_size(i)
                                                 Re_R(i) = (1._wp - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q)))/Res(i, q) &
                                                           + Re_R(i)
@@ -2111,7 +2111,7 @@ contains
                                 H_R = (E_R + pres_R)/rho_R
 
                                 if (avg_state == 2) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, nb
                                         R0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, rs(i))
                                         R0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, rs(i))
@@ -2131,7 +2131,7 @@ contains
                                         else
                                             nbub_L_denom = 0._wp
                                             nbub_R_denom = 0._wp
-                                            !$acc loop seq
+                                            $:LOOP()
                                             do i = 1, nb
                                                 nbub_L_denom = nbub_L_denom + (R0_L(i)**3._wp)*weight(i)
                                                 nbub_R_denom = nbub_R_denom + (R0_R(i)**3._wp)*weight(i)
@@ -2145,7 +2145,7 @@ contains
                                         nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, bubxb)
                                     end if
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, nb
                                         if (.not. qbmm) then
                                             if (polytropic) then
@@ -2178,7 +2178,7 @@ contains
                                         R3V2Lbar = 0._wp
                                         R3V2Rbar = 0._wp
 
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, nb
                                             PbwR3Lbar = PbwR3Lbar + pbw_L(i)*(R0_L(i)**3._wp)*weight(i)
                                             PbwR3Rbar = PbwR3Rbar + pbw_R(i)*(R0_R(i)**3._wp)*weight(i)
@@ -2213,7 +2213,7 @@ contains
                                     gamma_avg = 5e-1_wp*(gamma_L + gamma_R)
                                     vel_avg_rms = 0._wp
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_dims
                                         vel_avg_rms = vel_avg_rms + (5e-1_wp*(vel_L(i) + vel_R(i)))**2._wp
                                     end do
@@ -2232,7 +2232,7 @@ contains
                                                               vel_avg_rms, 0._wp, c_avg)
 
                                 if (viscous) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, 2
                                         Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                     end do
@@ -2296,7 +2296,7 @@ contains
                                     pcorr = 0._wp
                                 end if
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2315,7 +2315,7 @@ contains
 
                                 ! Include p_tilde
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         xi_M*(rho_L*(vel_L(dir_idx(1))* &
@@ -2347,7 +2347,7 @@ contains
                                     + (s_M/s_L)*(s_P/s_R)*pcorr*s_S
 
                                 ! Volume fraction flux
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = advxb, advxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2357,7 +2357,7 @@ contains
                                 end do
 
                                 ! Source for volume fraction advection equation
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
                                         xi_M*(vel_L(dir_idx(i)) + &
@@ -2373,7 +2373,7 @@ contains
                                 flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
 
                                 ! Add advection flux for bubble variables
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = bubxb, bubxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2402,7 +2402,7 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         ! Substituting the advective flux into the inviscid geometrical source flux
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -2419,7 +2419,7 @@ contains
                                                                       (1._wp - dir_flg(dir_idx(1)))* &
                                                                       vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
                                         ! Geometrical source of the void fraction(s) is zero
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -2427,7 +2427,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -2468,14 +2468,14 @@ contains
 
                                 !idx1 = 1; if (dir_idx(1) == 2) idx1 = 2; if (dir_idx(1) == 3) idx1 = 3
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
                                 end do
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -2502,32 +2502,32 @@ contains
                                 ! Change this by splitting it into the cases
                                 ! present in the bubbles_euler
                                 if (mpp_lim) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
                                         alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     end do
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
                                     end do
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
                                         alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
                                     end do
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
                                     end do
                                 end if
 
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -2541,13 +2541,13 @@ contains
                                 end do
 
                                 if (viscous) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, 2
                                         Re_L(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do q = 1, Re_size(i)
                                             Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_L(i)
@@ -2557,13 +2557,13 @@ contains
 
                                     end do
 
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, 2
                                         Re_R(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do q = 1, Re_size(i)
                                             Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_R(i)
@@ -2575,7 +2575,7 @@ contains
 
                                 if (chemistry) then
                                     c_sum_Yi_Phi = 0.0_wp
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = chemxb, chemxe
                                         Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -2634,19 +2634,19 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
                                 if (hypoelasticity) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, strxe - strxb + 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
                                     end do
                                     G_L = 0_wp
                                     G_R = 0_wp
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         G_L = G_L + alpha_L(i)*Gs(i)
                                         G_R = G_R + alpha_R(i)*Gs(i)
                                     end do
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, strxe - strxb + 1
                                         ! Elastic contribution to energy if G large enough
                                         if ((G_L > verysmall) .and. (G_R > verysmall)) then
@@ -2663,14 +2663,14 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
                                 if (hyperelasticity) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_dims
                                         xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
                                         xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
                                     end do
                                     G_L = 0_wp
                                     G_R = 0_wp
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_fluids
                                         ! Mixture left and right shear modulus
                                         G_L = G_L + alpha_L(i)*Gs(i)
@@ -2681,7 +2681,7 @@ contains
                                         E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
                                         E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
                                     end if
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, b_size - 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
@@ -2705,7 +2705,7 @@ contains
                                                               vel_avg_rms, c_sum_Yi_Phi, c_avg)
 
                                 if (viscous) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, 2
                                         Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                     end do
@@ -2781,7 +2781,7 @@ contains
 
                                 ! COMPUTING THE HLLC FLUXES
                                 ! MASS FLUX.
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2792,7 +2792,7 @@ contains
 
                                 ! MOMENTUM FLUX.
                                 ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = &
@@ -2827,7 +2827,7 @@ contains
                                 ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
                                 if (elasticity) then
                                     flux_ene_e = 0_wp
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_dims
                                         idxi = dir_idx(i)
                                         ! MOMENTUM ELASTIC FLUX.
@@ -2846,7 +2846,7 @@ contains
 
                                 ! HYPOELASTIC STRESS EVOLUTION FLUX.
                                 if (hypoelasticity) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, strxe - strxb + 1
                                         flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + &
@@ -2855,7 +2855,7 @@ contains
                                 end if
 
                                 ! VOLUME FRACTION FLUX.
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = advxb, advxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2865,7 +2865,7 @@ contains
                                 end do
 
                                 ! VOLUME FRACTION SOURCE FLUX.
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     vel_src_rs${XYZ}$_vf(j, k, l, idxi) = &
@@ -2888,7 +2888,7 @@ contains
 
                                 ! REFERENCE MAP FLUX.
                                 if (hyperelasticity) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = 1, num_dims
                                         flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
@@ -2901,7 +2901,7 @@ contains
                                 flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, idx1)
 
                                 if (chemistry) then
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do i = chemxb, chemxe
                                         Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -2916,7 +2916,7 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         !Substituting the advective flux into the inviscid geometrical source flux
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -2933,7 +2933,7 @@ contains
                                                                       (1._wp - dir_flg(idx1))* &
                                                                       vel_R(idx1)) - vel_R(idx1))))
                                         ! Geometrical source of the void fraction(s) is zero
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -2941,7 +2941,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        !$acc loop seq
+                                        $:LOOP()
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -3134,7 +3134,7 @@ contains
                             ! Sum properties of all fluid components
                             rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp
                             rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 rho%L = rho%L + alpha_rho_L(i)
                                 gamma%L = gamma%L + alpha_L(i)*gammas(i)
@@ -3304,7 +3304,7 @@ contains
                             ! Energy
                             flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7)
                             ! Partial fraction
-                            !$acc loop seq
+                            $:LOOP()
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now)
                             end do
@@ -4017,7 +4017,7 @@ contains
 
                     ! Average velocities and their derivatives at the interface
                     ! For cylindrical: x-dir ~ axial (z_cyl), y-dir ~ radial (r_cyl), z-dir ~ azimuthal (theta_cyl)
-                    !$acc loop seq
+                    $:LOOP()
                     do i_vel = 1, num_dims
                         avg_v_int(i_vel) = 0.5_wp*(velL_vf(i_vel)%sf(j, k, l) + velR_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
 
@@ -4095,7 +4095,7 @@ contains
                             end if
                         end select
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i_vel = 1, num_dims
                             flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) = flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) - stress_vector_shear(i_vel)
                             flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(i_vel)*stress_vector_shear(i_vel)
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 85a00f7b13..49f1a5e7d7 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -49,7 +49,7 @@ contains
 
         integer :: i
 
-        !$acc loop seq
+        $:LOOP()
         do i = 1, num_fluids
             alpha_rho(i) = q_prim_vf(i)%sf(j, k, l)
             alpha(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
@@ -64,13 +64,13 @@ contains
             call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha, alpha_rho, Re, j, k, l)
         end if
 
-        !$acc loop seq
+        $:LOOP()
         do i = 1, num_vels
             vel(i) = q_prim_vf(contxe + i)%sf(j, k, l)
         end do
 
         vel_sum = 0._wp
-        !$acc loop seq
+        $:LOOP()
         do i = 1, num_vels
             vel_sum = vel_sum + vel(i)**2._wp
         end do
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index cd48b01e50..84eb8a4329 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -82,7 +82,7 @@ contains
         do l = is3_viscous%beg, is3_viscous%end
             do k = is2_viscous%beg, is2_viscous%end
                 do j = is1_viscous%beg, is1_viscous%end
-                    !$acc loop seq
+                    $:LOOP()
                     do i = momxb, E_idx
                         tau_Re_vf(i)%sf(j, k, l) = 0._wp
                     end do
@@ -96,7 +96,7 @@ contains
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -112,14 +112,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -138,7 +138,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -149,7 +149,7 @@ contains
 
                             end if
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -157,12 +157,12 @@ contains
                             end do
 
                             if (viscous) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -182,7 +182,7 @@ contains
                                         - 2._wp*grad_x_vf(1)%sf(j, k, l) &
                                         - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
                                        (3._wp*Re_visc(1))
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, 2
                             tau_Re_vf(contxe + i)%sf(j, k, l) = &
                                 tau_Re_vf(contxe + i)%sf(j, k, l) - &
@@ -204,7 +204,7 @@ contains
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -220,14 +220,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -246,7 +246,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -257,7 +257,7 @@ contains
 
                             end if
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -265,12 +265,12 @@ contains
                             end do
 
                             if (viscous) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -309,7 +309,7 @@ contains
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -325,14 +325,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -351,7 +351,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -362,7 +362,7 @@ contains
 
                             end if
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -370,12 +370,12 @@ contains
                             end do
 
                             if (viscous) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -395,7 +395,7 @@ contains
                                         y_cc(k) + grad_y_vf(3)%sf(j, k, l))/ &
                                        Re_visc(1)
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 2, 3
                             tau_Re_vf(contxe + i)%sf(j, k, l) = &
                                 tau_Re_vf(contxe + i)%sf(j, k, l) - &
@@ -418,7 +418,7 @@ contains
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        !$acc loop seq
+                        $:LOOP()
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -434,14 +434,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -460,7 +460,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -471,7 +471,7 @@ contains
 
                             end if
 
-                            !$acc loop seq
+                            $:LOOP()
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -479,12 +479,12 @@ contains
                             end do
 
                             if (viscous) then
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    !$acc loop seq
+                                    $:LOOP()
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -598,7 +598,7 @@ contains
             do l = is3_viscous%beg, is3_viscous%end
                 do k = iy%beg, iy%end
                     do j = is1_viscous%beg + 1, is1_viscous%end
-                        !$acc loop seq
+                        $:LOOP()
                         do i = iv%beg, iv%end
                             dqL_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
                                 (q_prim_qp%vf(i)%sf(j, k, l) - &
@@ -613,7 +613,7 @@ contains
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end - 1
-                        !$acc loop seq
+                        $:LOOP()
                         do i = iv%beg, iv%end
                             dqR_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
                                 (q_prim_qp%vf(i)%sf(j + 1, k, l) - &
@@ -630,7 +630,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg, is1_viscous%end
-                            !$acc loop seq
+                            $:LOOP()
                             do i = iv%beg, iv%end
                                 dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
                                     (q_prim_qp%vf(i)%sf(k, j, l) - &
@@ -645,7 +645,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg, is1_viscous%end
-                            !$acc loop seq
+                            $:LOOP()
                             do i = iv%beg, iv%end
                                 dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
                                     (q_prim_qp%vf(i)%sf(k, j + 1, l) - &
@@ -660,7 +660,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                            !$acc loop seq
+                            $:LOOP()
                             do i = iv%beg, iv%end
                                 dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
                                     (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
@@ -679,7 +679,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                            !$acc loop seq
+                            $:LOOP()
                             do i = iv%beg, iv%end
                                 dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
                                     (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
@@ -699,7 +699,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg + 1, is1_viscous%end
-                            !$acc loop seq
+                            $:LOOP()
                             do i = iv%beg, iv%end
                                 dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
                                     (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
@@ -719,7 +719,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg, is1_viscous%end - 1
-                            !$acc loop seq
+                            $:LOOP()
                             do i = iv%beg, iv%end
                                 dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
                                     (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
@@ -741,7 +741,7 @@ contains
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
@@ -757,7 +757,7 @@ contains
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
@@ -773,7 +773,7 @@ contains
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg + 1, is1_viscous%end
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
@@ -794,7 +794,7 @@ contains
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end - 1
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
@@ -815,7 +815,7 @@ contains
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg + 1, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
@@ -836,7 +836,7 @@ contains
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
@@ -857,7 +857,7 @@ contains
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
@@ -878,7 +878,7 @@ contains
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
@@ -898,7 +898,7 @@ contains
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
@@ -918,7 +918,7 @@ contains
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = iv%beg, iv%end
                                     dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
                                         (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + &
@@ -1215,7 +1215,7 @@ contains
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg + 1, is1_viscous%end - 1
-                        !$acc loop seq
+                        $:LOOP()
                         do i = iv%beg, iv%end
                             dv_ds_vf(i)%sf(j, k, l) = &
                                 1._wp/((1._wp + wa_flg)*dL(j)) &
@@ -1243,7 +1243,7 @@ contains
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg + 1, is2_viscous%end - 1
                     do j = is1_viscous%beg, is1_viscous%end
-                        !$acc loop seq
+                        $:LOOP()
                         do i = iv%beg, iv%end
                             dv_ds_vf(i)%sf(j, k, l) = &
                                 1._wp/((1._wp + wa_flg)*dL(k)) &
@@ -1271,7 +1271,7 @@ contains
             do l = is3_viscous%beg + 1, is3_viscous%end - 1
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end
-                        !$acc loop seq
+                        $:LOOP()
                         do i = iv%beg, iv%end
                             dv_ds_vf(i)%sf(j, k, l) = &
                                 1._wp/((1._wp + wa_flg)*dL(l)) &
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 1915254cc3..830c852c83 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -799,7 +799,7 @@ contains
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, v_size
                                     ! reconstruct from left side
 
@@ -919,7 +919,7 @@ contains
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
-                                !$acc loop seq
+                                $:LOOP()
                                 do i = 1, v_size
 
                                     if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3, k, l, i) ! temporary field value array for clarity

From 8061a0d51c40f8779b6030e7aaf485dc7ac1d262 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 11 Jun 2025 12:51:07 -0400
Subject: [PATCH 18/75] Added declare macros

---
 src/common/m_boundary_common.fpp       |   2 +-
 src/common/m_mpi_common.fpp            |   7 +-
 src/common/m_phase_change.fpp          |   8 +-
 src/common/m_variables_conversion.fpp  |   6 +-
 src/simulation/m_acoustic_src.fpp      |  26 ++++---
 src/simulation/m_body_forces.fpp       |   2 +-
 src/simulation/m_bubbles.fpp           |   2 +-
 src/simulation/m_bubbles_EE.fpp        |   6 +-
 src/simulation/m_bubbles_EL.fpp        |  15 ++--
 src/simulation/m_cbc.fpp               |  25 +++---
 src/simulation/m_data_output.fpp       |   5 +-
 src/simulation/m_fftw.fpp              |   4 +-
 src/simulation/m_global_parameters.fpp | 104 ++++++++++++++-----------
 src/simulation/m_hyperelastic.fpp      |   6 +-
 src/simulation/m_hypoelastic.fpp       |   8 +-
 src/simulation/m_ibm.fpp               |   6 +-
 src/simulation/m_mhd.fpp               |   4 +-
 src/simulation/m_mpi_proxy.fpp         |   3 +-
 src/simulation/m_qbmm.fpp              |   8 +-
 src/simulation/m_rhs.fpp               |  40 +++++-----
 src/simulation/m_riemann_solvers.fpp   |  17 ++--
 src/simulation/m_surface_tension.fpp   |   6 +-
 src/simulation/m_time_steppers.fpp     |   2 +-
 src/simulation/m_viscous.fpp           |   4 +-
 src/simulation/m_weno.fpp              |  17 ++--
 25 files changed, 177 insertions(+), 156 deletions(-)

diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index a3460a3950..f793c94e4d 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -21,7 +21,7 @@ module m_boundary_common
     implicit none
 
     type(scalar_field), dimension(:, :), allocatable :: bc_buffers
-    !$acc declare create(bc_buffers)
+    $:DECLARE(create=["bc_buffers"])
 
     real(wp) :: bcxb, bcxe, bcyb, bcye, bczb, bcze
 
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 4c84b6ff8c..1706b848c2 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -1,5 +1,6 @@
 
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The module serves as a proxy to the parameters and subroutines
 !!          available in the MPI implementation's MPI module. Specifically,
@@ -25,7 +26,7 @@ module m_mpi_common
     implicit none
 
     integer, private :: err_code, ierr, v_size !<
-    !$acc declare create(v_size)
+    $:DECLARE(create=["v_size"])
     !! Generic flags used to identify and report MPI errors
 
     real(wp), private, allocatable, dimension(:), target :: buff_send !<
@@ -38,10 +39,10 @@ module m_mpi_common
     !! average primitive variables, for a single computational domain boundary
     !! at the time, from the relevant neighboring processor.
 
-    !$acc declare create(buff_send, buff_recv)
+    $:DECLARE(create=["buff_send","buff_recv"])
 
     integer :: halo_size, nVars
-    !$acc declare create(halo_size, nVars)
+    $:DECLARE(create=["halo_size","nVars"])
 
 contains
 
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 89d4e412fe..7725743e78 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -41,7 +41,7 @@ module m_phase_change
     real(wp) :: A, B, C, D
     !> @}
 
-    !$acc declare create(max_iter,pCr,TCr,mixM,lp,vp,A,B,C,D)
+    $:DECLARE(create=["max_iter","pCr","TCr","mixM","lp","vp","A","B","C","D"])
 
 contains
 
@@ -88,15 +88,15 @@ contains
         real(wp) :: rho, rM, m1, m2, MCT !< total density, total reacting mass, individual reacting masses
         real(wp) :: TvF !< total volume fraction
 
-        !$acc declare create(pS, pSOV, pSSL, TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, dynE, rhos, rho, rM, m1, m2, MCT, TvF)
+        $:DECLARE(create=["pS","pSOV","pSSL","TS","TSOV","TSSL","TSatOV","TSatSL"])
+        $:DECLARE(create=["rhoe","dynE","rhos","rho","rM","m1","m2","MCT","TvF"])
 
         real(wp), dimension(num_fluids) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
+        $:DECLARE(create=["p_infOV","p_infpT","p_infSL","sk","hk","gk","ek","rhok"])
 
         !< Generic loop iterators
         integer :: i, j, k, l
 
-        !$acc declare create(p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok)
-
         ! starting equilibrium solver
         $:PARALLEL_LOOP(collapse=3, private=["p_infOV", "p_infpT", "p_infSL", &
             "sk", "hk", "gk", "ek", "rhok", "pS", "pSOV", "pSSL", &
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 4634ee0e34..d75d69528b 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -51,16 +51,16 @@ module m_variables_conversion
     !! In simulation, gammas, pi_infs, and qvs are already declared in m_global_variables
 #ifndef MFC_SIMULATION
     real(wp), allocatable, public, dimension(:) :: gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps
-    !$acc declare create(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps)
+    $:DECLARE(create=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps"])
 #endif
 
     real(wp), allocatable, dimension(:) :: Gs
     integer, allocatable, dimension(:) :: bubrs
     real(wp), allocatable, dimension(:, :) :: Res
-    !$acc declare create(bubrs, Gs, Res)
+    $:DECLARE(create=["bubrs","Gs","Res"])
 
     integer :: is1b, is2b, is3b, is1e, is2e, is3e
-    !$acc declare create(is1b, is2b, is3b, is1e, is2e, is3e)
+    $:DECLARE(create=["is1b","is2b","is3b","is1e","is2e","is3e"])
 
     real(wp), allocatable, dimension(:, :, :), public :: rho_sf !< Scalar density function
     real(wp), allocatable, dimension(:, :, :), public :: gamma_sf !< Scalar sp. heat ratio function
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 73d1f14fb8..0a03a1b7f9 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -24,41 +24,43 @@ module m_acoustic_src
     private; public :: s_initialize_acoustic_src, s_precalculate_acoustic_spatial_sources, s_acoustic_src_calculations
 
     integer, allocatable, dimension(:) :: pulse, support
-    !$acc declare create(pulse, support)
+    $:DECLARE(create=["pulse","support"])
 
     logical, allocatable, dimension(:) :: dipole
-    !$acc declare create(dipole)
+    $:DECLARE(create=["dipole"])
 
     real(wp), allocatable, target, dimension(:, :) :: loc_acoustic
-    !$acc declare create(loc_acoustic)
+    $:DECLARE(create=["loc_acoustic"])
 
-    real(wp), allocatable, dimension(:) :: mag, length, height, wavelength, frequency, gauss_sigma_dist, gauss_sigma_time, npulse, dir, delay
-    !$acc declare create(mag, length, height, wavelength, frequency, gauss_sigma_dist, gauss_sigma_time, npulse, dir, delay)
+    real(wp), allocatable, dimension(:) :: mag, length, height, wavelength, frequency
+    real(wp), allocatable, dimension(:) :: gauss_sigma_dist, gauss_sigma_time, npulse, dir, delay
+    $:DECLARE(create=["mag","length","height","wavelength","frequency"])
+    $:DECLARE(create=["gauss_sigma_dist","gauss_sigma_time","npulse","dir","delay"])
 
     real(wp), allocatable, dimension(:) :: foc_length, aperture
-    !$acc declare create(foc_length, aperture)
+    $:DECLARE(create=["foc_length","aperture"])
 
     real(wp), allocatable, dimension(:) :: element_spacing_angle, element_polygon_ratio, rotate_angle
-    !$acc declare create(element_spacing_angle, element_polygon_ratio, rotate_angle)
+    $:DECLARE(create=["element_spacing_angle","element_polygon_ratio","rotate_angle"])
 
     real(wp), allocatable, dimension(:) :: bb_bandwidth, bb_lowest_freq
-    !$acc declare create(bb_bandwidth, bb_lowest_freq)
+    $:DECLARE(create=["bb_bandwidth","bb_lowest_freq"])
 
     integer, allocatable, dimension(:) :: num_elements, element_on, bb_num_freq
-    !$acc declare create(num_elements, element_on, bb_num_freq)
+    $:DECLARE(create=["num_elements","element_on","bb_num_freq"])
 
     !> @name Acoustic source terms
     !> @{
     real(wp), allocatable, dimension(:, :, :) :: mass_src, e_src
     real(wp), allocatable, dimension(:, :, :, :) :: mom_src
     !> @}
-    !$acc declare create(mass_src, e_src, mom_src)
+    $:DECLARE(create=["mass_src","e_src","mom_src"])
 
     integer, dimension(:), allocatable :: source_spatials_num_points !< Number of non-zero source grid points for each source
-    !$acc declare create(source_spatials_num_points)
+    $:DECLARE(create=["source_spatials_num_points"])
 
     type(source_spatial_type), dimension(:), allocatable :: source_spatials !< Data of non-zero source grid points for each source
-    !$acc declare create(source_spatials)
+    $:DECLARE(create=["source_spatials"])
 
 contains
 
diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index 4080a7ddf1..17e700fcfe 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -23,7 +23,7 @@ module m_body_forces
               s_finalize_body_forces_module
 
     real(wp), allocatable, dimension(:, :, :) :: rhoM
-    !$acc declare create(rhoM)
+    $:DECLARE(create=["rhoM"])
 
 contains
 
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index b64c28a4ee..11a7c075d2 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -21,7 +21,7 @@ module m_bubbles
     real(wp) :: chi_vw  !< Bubble wall properties (Ando 2010)
     real(wp) :: k_mw    !< Bubble wall properties (Ando 2010)
     real(wp) :: rho_mw  !< Bubble wall properties (Ando 2010)
-    !$acc declare create(chi_vw, k_mw, rho_mw)
+    $:DECLARE(create=["chi_vw","k_mw","rho_mw"])
 
 contains
 
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index 84ac15ecb3..fd770ebcf1 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -22,13 +22,13 @@ module m_bubbles_EE
 
     real(wp), allocatable, dimension(:, :, :) :: bub_adv_src
     real(wp), allocatable, dimension(:, :, :, :) :: bub_r_src, bub_v_src, bub_p_src, bub_m_src
-    !$acc declare create(bub_adv_src, bub_r_src, bub_v_src, bub_p_src, bub_m_src)
+    $:DECLARE(create=["bub_adv_src","bub_r_src","bub_v_src","bub_p_src","bub_m_src"])
 
     type(scalar_field) :: divu !< matrix for div(u)
-    !$acc declare create(divu)
+    $:DECLARE(create=["divu"])
 
     integer, allocatable, dimension(:) :: rs, vs, ms, ps
-    !$acc declare create(rs, vs, ms, ps)
+    $:DECLARE(create=["rs","vs","ms","ps"])
 
 contains
 
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 4fa66a2757..af9aa96436 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -33,20 +33,26 @@ module m_bubbles_EL
     real(wp), allocatable, dimension(:) :: bub_R0            !< Initial bubble radius
     real(wp), allocatable, dimension(:) :: Rmax_stats        !< Maximum radius
     real(wp), allocatable, dimension(:) :: Rmin_stats        !< Minimum radius
+    $:DECLARE(create=["lag_id", "bub_R0", "Rmax_stats", "Rmin_stats"])
+
     real(wp), allocatable, dimension(:) :: gas_mg            !< Bubble's gas mass
     real(wp), allocatable, dimension(:) :: gas_betaT         !< heatflux model (Preston et al., 2007)
     real(wp), allocatable, dimension(:) :: gas_betaC         !< massflux model (Preston et al., 2007)
     real(wp), allocatable, dimension(:) :: bub_dphidt        !< subgrid velocity potential (Maeda & Colonius, 2018)
+    $:DECLARE(create=["gas_mg", "gas_betaT", "gas_betaC", "bub_dphidt"])
+
     !(nBub, 1 -> actual val or 2 -> temp val)
     real(wp), allocatable, dimension(:, :) :: gas_p          !< Pressure in the bubble
     real(wp), allocatable, dimension(:, :) :: gas_mv         !< Vapor mass in the bubble
     real(wp), allocatable, dimension(:, :) :: intfc_rad      !< Bubble radius
     real(wp), allocatable, dimension(:, :) :: intfc_vel      !< Velocity of the bubble interface
+    $:DECLARE(create=["gas_p", "gas_mv", "intfc_rad", "intfc_vel"])
     !(nBub, 1-> x or 2->y or 3 ->z, 1 -> actual or 2 -> temporal val)
     real(wp), allocatable, dimension(:, :, :) :: mtn_pos     !< Bubble's position
     real(wp), allocatable, dimension(:, :, :) :: mtn_posPrev !< Bubble's previous position
     real(wp), allocatable, dimension(:, :, :) :: mtn_vel     !< Bubble's velocity
     real(wp), allocatable, dimension(:, :, :) :: mtn_s       !< Bubble's computational cell position in real format
+    $:DECLARE(create=["mtn_pos", "mtn_posPrev", "mtn_vel", "mtn_s"])
     !(nBub, 1-> x or 2->y or 3 ->z, time-stage)
     real(wp), allocatable, dimension(:, :) :: intfc_draddt   !< Time derivative of bubble's radius
     real(wp), allocatable, dimension(:, :) :: intfc_dveldt   !< Time derivative of bubble's interface velocity
@@ -54,21 +60,18 @@ module m_bubbles_EL
     real(wp), allocatable, dimension(:, :) :: gas_dmvdt      !< Time derivative of the vapor mass in the bubble
     real(wp), allocatable, dimension(:, :, :) :: mtn_dposdt  !< Time derivative of the bubble's position
     real(wp), allocatable, dimension(:, :, :) :: mtn_dveldt  !< Time derivative of the bubble's velocity
-
-    !$acc declare create(lag_id, bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt,       &
-    !$acc gas_p, gas_mv, intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, &
-    !$acc gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt)
+    $:DECLARE(create=["intfc_draddt", "intfc_dveldt", "gas_dpdt", "gas_dmvdt", "mtn_dposdt", "mtn_dveldt"])
 
     integer, private :: lag_num_ts                                  !<  Number of time stages in the time-stepping scheme
 
-    !$acc declare create(lag_num_ts)
+    $:DECLARE(create=["lag_num_ts"])
 
     integer :: nBubs                            !< Number of bubbles in the local domain
     real(wp) :: Rmax_glb, Rmin_glb       !< Maximum and minimum bubbe size in the local domain
     type(vector_field) :: q_beta                !< Projection of the lagrangian particles in the Eulerian framework
     integer :: q_beta_idx                       !< Size of the q_beta vector field
 
-    !$acc declare create(nBubs, Rmax_glb, Rmin_glb, q_beta, q_beta_idx)
+    $:DECLARE(create=["nBubs","Rmax_glb","Rmin_glb","q_beta","q_beta_idx"])
 
 contains
 
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index dd26870eb8..28fe01bea8 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -49,6 +49,7 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsz_vf
+    $:DECLARE(create=["q_prim_rsx_vf","q_prim_rsy_vf","q_prim_rsz_vf"])
 
     type(scalar_field), allocatable, dimension(:) :: F_rs_vf, F_src_rs_vf !<
 
@@ -58,6 +59,7 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: F_rsx_vf, F_src_rsx_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: F_rsy_vf, F_src_rsy_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: F_rsz_vf, F_src_rsz_vf !<
+    $:DECLARE(create=["F_rsx_vf","F_src_rsx_vf","F_rsy_vf","F_src_rsy_vf","F_rsz_vf","F_src_rsz_vf"])
 
     !! There is a CCE bug that is causing some subset of these variables to interfere
     !! with variables of the same name in m_riemann_solvers.fpp, and giving this versions
@@ -68,13 +70,14 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf_l, flux_src_rsx_vf_l !<
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf_l, flux_src_rsy_vf_l
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf_l, flux_src_rsz_vf_l
+    $:DECLARE(create=["flux_rsx_vf_l","flux_src_rsx_vf_l","flux_rsy_vf_l","flux_src_rsy_vf_l","flux_rsz_vf_l","flux_src_rsz_vf_l"])
 
     real(wp) :: c           !< Cell averaged speed of sound
     real(wp), dimension(2) :: Re          !< Cell averaged Reynolds numbers
-    !$acc declare create(c, Re)
+    $:DECLARE(create=["c","Re"])
 
     real(wp) :: dpres_ds !< Spatial derivatives in s-dir of pressure
-    !$acc declare create(dpres_ds)
+    $:DECLARE(create=["dpres_ds"])
 
     real(wp), allocatable, dimension(:) :: ds !< Cell-width distribution in the s-direction
 
@@ -94,18 +97,21 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :) :: pi_coef_y !< Polynomial interpolant coefficients in y-dir
     real(wp), allocatable, dimension(:, :, :) :: pi_coef_z !< Polynomial interpolant coefficients in z-dir
 
+    $:DECLARE(create=["ds","fd_coef_x","fd_coef_y","fd_coef_z","pi_coef_x","pi_coef_y","pi_coef_z"])
+
     !! The first dimension of the array identifies the polynomial, the
     !! second dimension identifies the position of its coefficients and the last
     !! dimension denotes the location of the CBC.
 
     type(int_bounds_info) :: is1, is2, is3 !< Indical bounds in the s1-, s2- and s3-directions
-    !$acc declare create(is1, is2, is3)
+    $:DECLARE(create=["is1","is2","is3"])
 
     integer :: dj
     integer :: bcxb, bcxe, bcyb, bcye, bczb, bcze
     integer :: cbc_dir, cbc_loc
     integer :: flux_cbc_index
-    !$acc declare create(dj, bcxb, bcxe, bcyb, bcye, bczb, bcze, cbc_dir, cbc_loc, flux_cbc_index)
+    $:DECLARE(create=["dj","bcxb","bcxe","bcyb","bcye","bczb","bcze"])
+    $:DECLARE(create=["cbc_dir", "cbc_loc","flux_cbc_index"])
 
     !! GRCBC inputs for subsonic inflow and outflow conditions consisting of
     !! inflow velocities, pressure, density and void fraction as well as
@@ -114,14 +120,9 @@ module m_cbc
     real(wp), allocatable, dimension(:) :: pres_in, pres_out, Del_in, Del_out
     real(wp), allocatable, dimension(:, :) :: vel_in, vel_out
     real(wp), allocatable, dimension(:, :) :: alpha_rho_in, alpha_in
-    !$acc declare create(pres_in, pres_out, Del_in, Del_out)
-    !$acc declare create(vel_in, vel_out)
-    !$acc declare create(alpha_rho_in, alpha_in)
-
-    !$acc declare create(q_prim_rsx_vf, q_prim_rsy_vf, q_prim_rsz_vf,  F_rsx_vf, F_src_rsx_vf,flux_rsx_vf_l, flux_src_rsx_vf_l, &
-    !$acc                 F_rsy_vf, F_src_rsy_vf,flux_rsy_vf_l, flux_src_rsy_vf_l, F_rsz_vf, F_src_rsz_vf,flux_rsz_vf_l, flux_src_rsz_vf_l, &
-    !$acc                 ds,fd_coef_x,fd_coef_y,fd_coef_z,      &
-    !$acc                 pi_coef_x,pi_coef_y,pi_coef_z)
+    $:DECLARE(create=["pres_in","pres_out","Del_in","Del_out"])
+    $:DECLARE(create=["vel_in","vel_out"])
+    $:DECLARE(create=["alpha_rho_in","alpha_in"])
 
 contains
 
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index ca0d7dbb8d..7ff2756709 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -55,13 +55,14 @@ module m_data_output
     real(wp), allocatable, dimension(:, :, :) :: ccfl_sf  !< CCFL stability criterion
     real(wp), allocatable, dimension(:, :, :) :: Rc_sf  !< Rc stability criterion
     real(wp), public, allocatable, dimension(:, :) :: c_mass
-    !$acc declare create(icfl_sf, vcfl_sf, ccfl_sf, Rc_sf)
+    $:DECLARE(create=["icfl_sf","vcfl_sf","ccfl_sf","Rc_sf"])
 
     real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids
     real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids
     real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids
     real(wp) :: Rc_min_loc, Rc_min_glb !< Rc   stability extrema on local and global grids
-    !$acc declare create(icfl_max_loc, icfl_max_glb, vcfl_max_loc, vcfl_max_glb, ccfl_max_loc, ccfl_max_glb, Rc_min_loc, Rc_min_glb)
+    $:DECLARE(create=["icfl_max_loc","icfl_max_glb","vcfl_max_loc","vcfl_max_glb"])
+    $:DECLARE(create=["ccfl_max_loc","ccfl_max_glb","Rc_min_loc","Rc_min_glb"])
 
     !> @name ICFL, VCFL, CCFL and Rc stability criteria extrema over all the time-steps
     !> @{
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 10e063496c..c0cdc226ea 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -47,12 +47,12 @@ module m_fftw
     !! Filtered complex data in Fourier space
 
 #if defined(MFC_OpenACC)
-    !$acc declare create(real_size, cmplx_size, x_size, batch_size, Nfq)
+    $:DECLARE(create=["real_size","cmplx_size","x_size","batch_size","Nfq"])
 
     real(dp), allocatable, target :: data_real_gpu(:)
     complex(dp), allocatable, target :: data_cmplx_gpu(:)
     complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:)
-!$acc declare create(data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu)
+    $:DECLARE(create=["data_real_gpu","data_cmplx_gpu","data_fltr_cmplx_gpu"])
 
 #if defined(__PGI)
     integer :: fwd_plan_gpu, bwd_plan_gpu
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index d0b94f6b7a..348f9d542c 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -4,6 +4,7 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The module contains all of the parameters describing the program
 !!              logistics, the computational domain and the simulation algorithm.
@@ -53,7 +54,7 @@ module m_global_parameters
     logical :: cyl_coord
     integer :: grid_geometry
     !> @}
-    !$acc declare create(cyl_coord, grid_geometry)
+    $:DECLARE(create=["cyl_coord","grid_geometry"])
 
     !> @name Cell-boundary (CB) locations in the x-, y- and z-directions, respectively
     !> @{
@@ -76,7 +77,7 @@ module m_global_parameters
 
     real(wp) :: dt !< Size of the time-step
 
-    !$acc declare create(x_cb, y_cb, z_cb, x_cc, y_cc, z_cc, dx, dy, dz, dt, m, n, p)
+    $:DECLARE(create=["x_cb","y_cb","z_cb","x_cc","y_cc","z_cc","dx","dy","dz","dt","m","n","p"])
 
     !> @name Starting time-step iteration, stopping time-step iteration and the number
     !! of time-step iterations between successive solution backups, respectively
@@ -90,7 +91,7 @@ module m_global_parameters
     real(wp) :: t_stop, t_save, cfl_target
     integer :: n_start
     !> @}
-    !$acc declare create(cfl_target)
+    $:DECLARE(create=["cfl_target"])
 
     logical :: cfl_adap_dt, cfl_const_dt, cfl_dt
 
@@ -158,7 +159,7 @@ module m_global_parameters
     logical :: bulk_stress   !< Bulk stresses
     logical :: cont_damage   !< Continuum damage modeling
 
-    !$acc declare create(chemistry)
+    $:DECLARE(create=["chemistry"])
 
     logical :: bodyForces
     logical :: bf_x, bf_y, bf_z !< body force toggle in three directions
@@ -169,24 +170,27 @@ module m_global_parameters
         #:endfor
     #:endfor
     real(wp), dimension(3) :: accel_bf
-    !$acc declare create(accel_bf)
+    $:DECLARE(create=["accel_bf"])
 
     integer :: cpu_start, cpu_end, cpu_rate
 
     #:if not MFC_CASE_OPTIMIZATION
-        !$acc declare create(num_dims, num_vels, weno_polyn, weno_order, weno_num_stencils, num_fluids, wenojs, mapped_weno, wenoz, teno, wenoz_q, mhd, relativity)
+        $:DECLARE(create=["num_dims","num_vels","weno_polyn","weno_order"])
+        $:DECLARE(create=["weno_num_stencils","num_fluids","wenojs"])
+        $:DECLARE(create=["mapped_weno", "wenoz","teno","wenoz_q","mhd","relativity"])
     #:endif
 
-    !$acc declare create(mpp_lim, model_eqns, mixture_err, alt_soundspeed, avg_state, mp_weno, weno_eps, teno_CT, hypoelasticity, hyperelasticity, hyper_model, elasticity, low_Mach, viscous, shear_stress, bulk_stress, cont_damage)
+    $:DECLARE(create=["mpp_lim","model_eqns","mixture_err","alt_soundspeed"])
+    $:DECLARE(create=["avg_state","mp_weno","weno_eps","teno_CT","hypoelasticity"])
+    $:DECLARE(create=["hyperelasticity","hyper_model","elasticity","low_Mach"])
+    $:DECLARE(create=["viscous","shear_stress","bulk_stress","cont_damage"])
 
     logical :: relax          !< activate phase change
     integer :: relax_model    !< Relaxation model
     real(wp) :: palpha_eps     !< trigger parameter for the p relaxation procedure, phase change model
     real(wp) :: ptgalpha_eps   !< trigger parameter for the pTg relaxation procedure, phase change model
 
-!#ifndef _CRAYFTN
-!$acc declare create(relax, relax_model, palpha_eps,ptgalpha_eps)
-!#endif
+    $:DECLARE(create=["relax", "relax_model", "palpha_eps","ptgalpha_eps"])
 
     integer :: num_bc_patches
     logical :: bc_io
@@ -196,6 +200,10 @@ module m_global_parameters
     !> @{
     type(int_bounds_info) :: bc_x, bc_y, bc_z
     !> @}
+    $:DECLARE(create=["bc_x%vb1", "bc_x%vb2", "bc_x%vb3", "bc_x%ve1", "bc_x%ve2", "bc_x%ve3"])
+    $:DECLARE(create=["bc_y%vb1", "bc_y%vb2", "bc_y%vb3", "bc_y%ve1", "bc_y%ve2", "bc_y%ve3"])
+    $:DECLARE(create=["bc_z%vb1", "bc_z%vb2", "bc_z%vb3", "bc_z%ve1", "bc_z%ve2", "bc_z%ve3"])
+
     type(bounds_info) :: x_domain, y_domain, z_domain
     real(wp) :: x_a, y_a, z_a
     real(wp) :: x_b, y_b, z_b
@@ -248,19 +256,20 @@ module m_global_parameters
     integer :: c_idx                                   !< Index of color function
     integer :: damage_idx                              !< Index of damage state variable (D) for continuum damage model
     !> @}
-
-    !$acc declare create(bub_idx)
+    $:DECLARE(create=["sys_size","E_idx","n_idx","bub_idx","alf_idx","gamma_idx"])
+    $:DECLARE(create=["pi_inf_idx","B_idx","stress_idx","xi_idx","b_size"])
+    $:DECLARE(create=["tensor_size","species_idx","c_idx"])
 
     ! Cell Indices for the (local) interior points (O-m, O-n, 0-p).
     ! Stands for "InDices With INTerior".
     type(int_bounds_info) :: idwint(1:3)
-    !$acc declare create(idwint)
+    $:DECLARE(create=["idwint"])
 
     ! Cell Indices for the entire (local) domain. In simulation and post_process,
     ! this includes the buffer region. idwbuff and idwint are the same otherwise.
     ! Stands for "InDices With BUFFer".
     type(int_bounds_info) :: idwbuff(1:3)
-    !$acc declare create(idwbuff)
+    $:DECLARE(create=["idwbuff"])
 
     !> @name The number of fluids, along with their identifying indexes, respectively,
     !! for which viscous effects, e.g. the shear and/or the volume Reynolds (Re)
@@ -270,7 +279,7 @@ module m_global_parameters
     integer, allocatable, dimension(:, :) :: Re_idx
     !> @}
 
-    !$acc declare create(Re_size, Re_idx)
+    $:DECLARE(create=["Re_size","Re_idx"])
 
     ! The WENO average (WA) flag regulates whether the calculation of any cell-
     ! average spatial derivatives is carried out in each cell by utilizing the
@@ -281,7 +290,7 @@ module m_global_parameters
     real(wp) :: wa_flg
     !> @{
 
-    !$acc declare create(wa_flg)
+    $:DECLARE(create=["wa_flg"])
 
     !> @name The coordinate direction indexes and flags (flg), respectively, for which
     !! the configurations will be determined with respect to a working direction
@@ -293,14 +302,15 @@ module m_global_parameters
     integer, dimension(3) :: dir_idx_tau !!used for hypoelasticity=true
     !> @}
 
-    !$acc declare create(dir_idx, dir_flg, dir_idx_tau)
+    $:DECLARE(create=["dir_idx","dir_flg","dir_idx_tau"])
 
     integer :: buff_size !<
     !! The number of cells that are necessary to be able to store enough boundary
     !! conditions data to march the solution in the physical computational domain
     !! to the next time-step.
 
-    !$acc declare create(sys_size, buff_size, E_idx, gamma_idx, pi_inf_idx, alf_idx, n_idx, stress_idx, b_size, tensor_size, xi_idx, species_idx, B_idx, c_idx)
+    $:DECLARE(create=["buff_size"])
+
 
     integer :: shear_num !! Number of shear stress components
     integer, dimension(3) :: shear_indices !<
@@ -311,7 +321,7 @@ module m_global_parameters
     !! Indices of shear stress components to reflect for boundary conditions.
     !! Size: (1:3, 1:shear_BC_flip_num) for (x/y/z, [indices])
 
-    !$acc declare create(shear_num, shear_indices, shear_BC_flip_num, shear_BC_flip_indices)
+    $:DECLARE(create=["shear_num","shear_indices","shear_BC_flip_num","shear_BC_flip_indices"])
 
     ! END: Simulation Algorithm Parameters
 
@@ -322,10 +332,6 @@ module m_global_parameters
     !! in the flow. These include the stiffened gas equation of state parameters,
     !! the Reynolds numbers and the Weber numbers.
 
-    !$acc declare create(bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3)
-    !$acc declare create(bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3)
-    !$acc declare create(bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3)
-
     integer :: fd_order !<
     !! The order of the finite-difference (fd) approximations of the first-order
     !! derivatives that need to be evaluated when the CoM or flow probe data
@@ -335,7 +341,7 @@ module m_global_parameters
     !! The finite-difference number is given by MAX(1, fd_order/2). Essentially,
     !! it is a measure of the half-size of the finite-difference stencil for the
     !! selected order of accuracy.
-    !$acc declare create(fd_order,fd_number)
+    $:DECLARE(create=["fd_order","fd_number"])
 
     logical :: probe_wrt
     logical :: integral_wrt
@@ -348,7 +354,7 @@ module m_global_parameters
     !> @{
     real(wp) :: rhoref, pref
     !> @}
-    !$acc declare create(rhoref, pref)
+    $:DECLARE(create=["rhoref","pref"])
 
     !> @name Immersed Boundaries
     !> @{
@@ -363,7 +369,7 @@ module m_global_parameters
     !! the maximum allowable number of patches, num_patches_max, may be changed
     !! in the module m_derived_types.f90.
 
-    !$acc declare create(ib, num_ibs, patch_ib)
+    $:DECLARE(create=["ib","num_ibs","patch_ib"])
     !> @}
 
     !> @name Bubble modeling
@@ -378,26 +384,31 @@ module m_global_parameters
     real(wp) :: Ca       !< Cavitation number
     real(wp) :: Web      !< Weber number
     real(wp) :: Re_inv   !< Inverse Reynolds number
+    $:DECLARE(create=["R0ref","Ca","Web","Re_inv"])
 
     real(wp), dimension(:), allocatable :: weight !< Simpson quadrature weights
     real(wp), dimension(:), allocatable :: R0     !< Bubble sizes
     real(wp), dimension(:), allocatable :: V0     !< Bubble velocities
-    !$acc declare create(weight, R0, V0)
+    $:DECLARE(create=["weight","R0","V0"])
 
     logical :: bubbles_euler      !< Bubbles euler on/off
     logical :: polytropic   !< Polytropic  switch
     logical :: polydisperse !< Polydisperse bubbles
+    $:DECLARE(create=["bubbles_euler","polytropic","polydisperse"])
+
     logical :: adv_n        !< Solve the number density equation and compute alpha from number density
     logical :: adap_dt      !< Adaptive step size control
     real(wp) :: adap_dt_tol !< Tolerance to control adaptive step size
+    $:DECLARE(create=["adv_n","adap_dt","adap_dt_tol"])
 
     integer :: bubble_model !< Gilmore or Keller--Miksis bubble model
     integer :: thermal      !< Thermal behavior. 1 = adiabatic, 2 = isotherm, 3 = transfer
+    $:DECLARE(create=["bubble_model","thermal"])
 
     real(wp), allocatable, dimension(:, :, :) :: ptil  !< Pressure modification
-    !$acc declare create(ptil)
 
     real(wp) :: poly_sigma  !< log normal sigma for polydisperse PDF
+    $:DECLARE(create=["ptil", "poly_sigma"])
 
     logical :: qbmm      !< Quadrature moment method
     integer, parameter :: nmom = 6 !< Number of carried moments per R0 location
@@ -406,38 +417,39 @@ module m_global_parameters
     integer :: R0_type
 
     real(wp) :: pi_fac   !< Factor for artificial pi_inf
+    $:DECLARE(create=["qbmm", "nmomsp","nmomtot","R0_type","pi_fac"])
 
     #:if not MFC_CASE_OPTIMIZATION
-        !$acc declare create(nb)
+        $:DECLARE(create=["nb"])
     #:endif
 
-    !$acc declare create(R0ref, Ca, Web, Re_inv, bubbles_euler, polytropic, polydisperse, qbmm, nmomsp, nmomtot, R0_type, bubble_model, thermal, poly_sigma, adv_n, adap_dt, adap_dt_tol, pi_fac)
-
     type(scalar_field), allocatable, dimension(:) :: mom_sp
     type(scalar_field), allocatable, dimension(:, :, :) :: mom_3d
-    !$acc declare create(mom_sp, mom_3d)
+    $:DECLARE(create=["mom_sp","mom_3d"])
 
     !> @}
 
     type(chemistry_parameters) :: chem_params
-    !$acc declare create(chem_params)
+    $:DECLARE(create=["chem_params"])
 
     !> @name Physical bubble parameters (see Ando 2010, Preston 2007)
     !> @{
 
     real(wp) :: R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_vl, k_nl, cp_n, cp_v
-    !$acc declare create(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_vl, k_nl, cp_n, cp_v)
+    $:DECLARE(create=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw",])
+    $:DECLARE(create=["pv","M_n", "M_v","k_vl","k_nl","cp_n","cp_v"])
 
     real(wp), dimension(:), allocatable :: k_n, k_v, pb0, mass_n0, mass_v0, Pe_T
     real(wp), dimension(:), allocatable :: Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN
-    !$acc declare create( k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN)
+    $:DECLARE(create=["k_n","k_v","pb0","mass_n0","mass_v0","Pe_T"])
+    $:DECLARE(create=["Re_trans_T","Re_trans_c","Im_trans_T","Im_trans_c","omegaN"])
 
     real(wp) :: mul0, ss, gamma_v, mu_v
     real(wp) :: gamma_m, gamma_n, mu_n
     real(wp) :: gam
     !> @}
 
-    !$acc declare create(mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam)
+    $:DECLARE(create=["mul0","ss","gamma_v","mu_v","gamma_m","gamma_n","mu_n","gam"])
 
     !> @name Acoustic acoustic_source parameters
     !> @{
@@ -445,14 +457,14 @@ module m_global_parameters
     type(acoustic_parameters), dimension(num_probes_max) :: acoustic !< Acoustic source parameters
     integer :: num_source !< Number of acoustic sources
     !> @}
-    !$acc declare create(acoustic_source, acoustic, num_source)
+    $:DECLARE(create=["acoustic_source","acoustic","num_source"])
 
     !> @name Surface tension parameters
     !> @{
 
     real(wp) :: sigma
     logical :: surface_tension
-    !$acc declare create(sigma, surface_tension)
+    $:DECLARE(create=["sigma","surface_tension"])
     !> @}
 
     integer :: momxb, momxe
@@ -463,11 +475,13 @@ module m_global_parameters
     integer :: strxb, strxe
     integer :: chemxb, chemxe
     integer :: xibeg, xiend
-    !$acc declare create(momxb, momxe, advxb, advxe, contxb, contxe, intxb, intxe, bubxb, bubxe, strxb, strxe, chemxb, chemxe)
-    !$acc declare create(xibeg,xiend)
+    $:DECLARE(create=["momxb","momxe","advxb","advxe","contxb","contxe"])
+    $:DECLARE(create=["intxb","intxe", "bubxb","bubxe"])
+    $:DECLARE(create=["strxb","strxe","chemxb","chemxe"])
+    $:DECLARE(create=["xibeg","xiend"])
 
     real(wp), allocatable, dimension(:) :: gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps
-    !$acc declare create(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps)
+    $:DECLARE(create=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps"])
 
     real(wp) :: mytime       !< Current simulation time
     real(wp) :: finaltime    !< Final simulation time
@@ -478,25 +492,25 @@ module m_global_parameters
 
     type(pres_field), allocatable, dimension(:) :: mv_ts
 
-    !$acc declare create(pb_ts, mv_ts)
+    $:DECLARE(create=["pb_ts","mv_ts"])
 
     !> @name lagrangian subgrid bubble parameters
     !> @{!
     logical :: bubbles_lagrange                         !< Lagrangian subgrid bubble model switch
     type(bubbles_lagrange_parameters) :: lag_params     !< Lagrange bubbles' parameters
-    !$acc declare create(bubbles_lagrange, lag_params)
+    $:DECLARE(create=["bubbles_lagrange","lag_params"])
     !> @}
 
     real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D)
     logical :: powell !< Powell‐correction for div B = 0
-    !$acc declare create(Bx0, powell)
+    $:DECLARE(create=["Bx0","powell"])
 
     !> @name Continuum damage model parameters
     !> @{!
     real(wp) :: tau_star        !< Stress threshold for continuum damage modeling
     real(wp) :: cont_damage_s   !< Exponent s for continuum damage modeling
     real(wp) :: alpha_bar       !< Damage rate factor for continuum damage modeling
-    !$acc declare create(tau_star, cont_damage_s, alpha_bar)
+    $:DECLARE(create=["tau_star","cont_damage_s","alpha_bar"])
     !> @}
 
 contains
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 43581328b7..11091627e8 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -27,14 +27,14 @@ module m_hyperelastic
     !! The btensor at the cell-interior Gaussian quadrature points.
     !! These tensor is needed to be calculated once and make the code DRY.
     type(vector_field) :: btensor !<
-    !$acc declare create(btensor)
+    $:DECLARE(create=["btensor"])
 
     real(wp), allocatable, dimension(:, :) :: fd_coeff_x
     real(wp), allocatable, dimension(:, :) :: fd_coeff_y
     real(wp), allocatable, dimension(:, :) :: fd_coeff_z
-    !$acc declare create(fd_coeff_x,fd_coeff_y,fd_coeff_z)
+    $:DECLARE(create=["fd_coeff_x","fd_coeff_y", "fd_coeff_z"])
     real(wp), allocatable, dimension(:) :: Gs
-    !$acc declare create(Gs)
+    $:DECLARE(create=["Gs"])
 
 contains
 
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index 4c1a101e49..430b69d5e5 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -21,20 +21,20 @@ module m_hypoelastic
  s_compute_damage_state
 
     real(wp), allocatable, dimension(:) :: Gs
-    !$acc declare create(Gs)
+    $:DECLARE(create=["Gs"])
 
     real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz
     real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz
     real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz
-    !$acc declare create(du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz)
+    $:DECLARE(create=["du_dx","du_dy","du_dz","dv_dx","dv_dy","dv_dz","dw_dx","dw_dy","dw_dz"])
 
     real(wp), allocatable, dimension(:, :, :) :: rho_K_field, G_K_field
-    !$acc declare create(rho_K_field, G_K_field)
+    $:DECLARE(create=["rho_K_field","G_K_field"])
 
     real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_z_h
-    !$acc declare create(fd_coeff_x_h,fd_coeff_y_h,fd_coeff_z_h)
+    $:DECLARE(create=["fd_coeff_x_h","fd_coeff_y_h","fd_coeff_z_h"])
 
 contains
 
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index d118a2fa55..a3538b2a7e 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -36,15 +36,15 @@ module m_ibm
     type(integer_field), public :: ib_markers
     type(levelset_field), public :: levelset
     type(levelset_norm_field), public :: levelset_norm
-    !$acc declare create(ib_markers, levelset, levelset_norm)
+    $:DECLARE(create=["ib_markers","levelset","levelset_norm"])
 
     type(ghost_point), dimension(:), allocatable :: ghost_points
     type(ghost_point), dimension(:), allocatable :: inner_points
-    !$acc declare create(ghost_points, inner_points)
+    $:DECLARE(create=["ghost_points","inner_points"])
 
     integer :: num_gps !< Number of ghost points
     integer :: num_inner_gps !< Number of ghost points
-    !$acc declare create(gp_layers, num_gps, num_inner_gps)
+    $:DECLARE(create=["gp_layers","num_gps","num_inner_gps"])
 
 contains
 
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index a096a4fe04..a32412398c 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -25,12 +25,12 @@ module m_mhd
     real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz
     real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz
     real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz
-    !$acc declare create(du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz)
+    $:DECLARE(create=["du_dx","du_dy","du_dz","dv_dx","dv_dy","dv_dz","dw_dx","dw_dy","dw_dz"])
 
     real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_z_h
-    !$acc declare create(fd_coeff_x_h,fd_coeff_y_h,fd_coeff_z_h)
+    $:DECLARE(create=["fd_coeff_x_h","fd_coeff_y_h","fd_coeff_z_h"])
 
 contains
 
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index f931227f4d..f7c05cd07f 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -4,6 +4,7 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The module serves as a proxy to the parameters and subroutines
 !!          available in the MPI implementation's MPI module. Specifically,
@@ -46,7 +47,7 @@ module m_mpi_proxy
     !> @{
     integer, private :: err_code, ierr, v_size
     !> @}
-    !$acc declare create(v_size)
+    $:DECLARE(create=["v_size"])
 
 contains
 
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 075334a2d0..863324b71d 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -26,21 +26,21 @@ module m_qbmm
     private; public :: s_initialize_qbmm_module, s_mom_inv, s_coeff, s_compute_qbmm_rhs
 
     real(wp), allocatable, dimension(:, :, :, :, :) :: momrhs
-    !$acc declare create(momrhs)
+    $:DECLARE(create=["momrhs"])
 
     #:if MFC_CASE_OPTIMIZATION
         integer, parameter :: nterms = ${nterms}$
     #:else
         integer :: nterms
-        !$acc declare create(nterms)
+        $:DECLARE(create=["nterms"])
     #:endif
 
     type(int_bounds_info) :: is1_qbmm, is2_qbmm, is3_qbmm
-    !$acc declare create(is1_qbmm, is2_qbmm, is3_qbmm)
+    $:DECLARE(create=["is1_qbmm","is2_qbmm","is3_qbmm"])
 
     integer, allocatable, dimension(:) :: bubrs
     integer, allocatable, dimension(:, :) :: bubmoms
-    !$acc declare create(bubrs, bubmoms)
+    $:DECLARE(create=["bubrs","bubmoms"])
 
 contains
 
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 9f96010e56..54899dc0fa 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -75,13 +75,13 @@ module m_rhs
     !! conservative variables, which are located in q_cons_vf, at cell-interior
     !! Gaussian quadrature points (QP).
     type(vector_field) :: q_cons_qp !<
-    !$acc declare create(q_cons_qp)
+    $:DECLARE(create=["q_cons_qp"])
 
     !! The primitive variables at cell-interior Gaussian quadrature points. These
     !! are calculated from the conservative variables and gradient magnitude (GM)
     !! of the volume fractions, q_cons_qp and gm_alpha_qp, respectively.
     type(vector_field) :: q_prim_qp !<
-    !$acc declare create(q_prim_qp)
+    $:DECLARE(create=["q_prim_qp"])
 
     !> @name The first-order spatial derivatives of the primitive variables at cell-
     !! interior Gaussian quadrature points. These are WENO-reconstructed from
@@ -90,7 +90,7 @@ module m_rhs
     !! of the primitive variables, located in qK_prim_n, where K = L or R.
     !> @{
     type(vector_field), allocatable, dimension(:) :: dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp
-    !$acc declare create(dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp)
+    $:DECLARE(create=["dq_prim_dx_qp","dq_prim_dy_qp","dq_prim_dz_qp"])
     !> @}
 
     !> @name The left and right WENO-reconstructed cell-boundary values of the cell-
@@ -100,26 +100,26 @@ module m_rhs
     !> @{
     type(vector_field), allocatable, dimension(:) :: dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n
     type(vector_field), allocatable, dimension(:) :: dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n
-    !$acc declare create(dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n)
-    !$acc declare create(dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n)
+    $:DECLARE(create=["dqL_prim_dx_n","dqL_prim_dy_n","dqL_prim_dz_n"])
+    $:DECLARE(create=["dqR_prim_dx_n","dqR_prim_dy_n","dqR_prim_dz_n"])
     !> @}
 
     type(scalar_field), allocatable, dimension(:) :: tau_Re_vf
-    !$acc declare create(tau_Re_vf)
+    $:DECLARE(create=["tau_Re_vf"])
 
     type(vector_field) :: gm_alpha_qp  !<
     !! The gradient magnitude of the volume fractions at cell-interior Gaussian
     !! quadrature points. gm_alpha_qp is calculated from individual first-order
     !! spatial derivatives located in dq_prim_ds_qp.
 
-    !$acc declare create(gm_alpha_qp)
+    $:DECLARE(create=["gm_alpha_qp"])
 
     !> @name The left and right WENO-reconstructed cell-boundary values of the cell-
     !! average gradient magnitude of volume fractions, located in gm_alpha_qp.
     !> @{
     type(vector_field), allocatable, dimension(:) :: gm_alphaL_n
     type(vector_field), allocatable, dimension(:) :: gm_alphaR_n
-    !$acc declare create(gm_alphaL_n, gm_alphaR_n)
+    $:DECLARE(create=["gm_alphaL_n","gm_alphaR_n"])
     !> @}
 
     !> @name The cell-boundary values of the fluxes (src - source, gsrc - geometrical
@@ -129,44 +129,44 @@ module m_rhs
     type(vector_field), allocatable, dimension(:) :: flux_n
     type(vector_field), allocatable, dimension(:) :: flux_src_n
     type(vector_field), allocatable, dimension(:) :: flux_gsrc_n
-    !$acc declare create(flux_n, flux_src_n, flux_gsrc_n)
+    $:DECLARE(create=["flux_n","flux_src_n","flux_gsrc_n"])
     !> @}
 
     type(vector_field), allocatable, dimension(:) :: qL_prim, qR_prim
-    !$acc declare create(qL_prim, qR_prim)
+    $:DECLARE(create=["qL_prim","qR_prim"])
 
     type(int_bounds_info) :: iv !< Vector field indical bounds
-    !$acc declare create(iv)
+    $:DECLARE(create=["iv"])
 
     !> @name Indical bounds in the x-, y- and z-directions
     !> @{
     type(int_bounds_info) :: irx, iry, irz
-    !$acc declare create(irx, iry, irz)
+    $:DECLARE(create=["irx","iry","irz"])
 
     type(int_bounds_info) :: is1, is2, is3
-    !$acc declare create(is1, is2, is3)
+    $:DECLARE(create=["is1","is2","is3"])
 
     !> @name Saved fluxes for testing
     !> @{
     type(scalar_field) :: alf_sum
     !> @}
-    !$acc declare create(alf_sum)
+    $:DECLARE(create=["alf_sum"])
 
     real(wp), allocatable, dimension(:, :, :) :: blkmod1, blkmod2, alpha1, alpha2, Kterm
     real(wp), allocatable, dimension(:, :, :, :) :: qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf
     real(wp), allocatable, dimension(:, :, :, :) :: dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf
-    !$acc declare create(blkmod1, blkmod2, alpha1, alpha2, Kterm)
-    !$acc declare create(qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf)
-    !$acc declare create(dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf)
+    $:DECLARE(create=["blkmod1","blkmod2","alpha1","alpha2","Kterm"])
+    $:DECLARE(create=["qL_rsx_vf","qL_rsy_vf","qL_rsz_vf","qR_rsx_vf","qR_rsy_vf","qR_rsz_vf"])
+    $:DECLARE(create=["dqL_rsx_vf","dqL_rsy_vf","dqL_rsz_vf","dqR_rsx_vf","dqR_rsy_vf","dqR_rsz_vf"])
 
     real(wp), allocatable, dimension(:) :: gamma_min, pres_inf
-    !$acc declare create(gamma_min, pres_inf)
+    $:DECLARE(create=["gamma_min","pres_inf"])
 
     real(wp), allocatable, dimension(:, :) :: Res
-    !$acc declare create(Res)
+    $:DECLARE(create=["Res"])
 
     real(wp), allocatable, dimension(:, :, :) :: nbub !< Bubble number density
-    !$acc declare create(nbub)
+    $:DECLARE(create=["nbub"])
 
 contains
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index a6d1c80a77..a593a36c71 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -66,8 +66,7 @@ module m_riemann_solvers
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf, flux_src_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf, flux_src_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf, flux_src_rsz_vf
-    !$acc declare create( flux_rsx_vf, flux_src_rsx_vf, flux_rsy_vf,  &
-    !$acc   flux_src_rsy_vf, flux_rsz_vf, flux_src_rsz_vf )
+    $:DECLARE(create=["flux_rsx_vf","flux_src_rsx_vf","flux_rsy_vf","flux_src_rsy_vf","flux_rsz_vf","flux_src_rsz_vf"])
     !> @}
 
     !> The cell-boundary values of the geometrical source flux that are computed
@@ -78,7 +77,7 @@ module m_riemann_solvers
     real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsx_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsy_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsz_vf !<
-    !$acc declare create( flux_gsrc_rsx_vf, flux_gsrc_rsy_vf, flux_gsrc_rsz_vf )
+    $:DECLARE(create=["flux_gsrc_rsx_vf","flux_gsrc_rsy_vf","flux_gsrc_rsz_vf"])
     !> @}
 
     ! The cell-boundary values of the velocity. vel_src_rs_vf is determined as
@@ -87,17 +86,17 @@ module m_riemann_solvers
     real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsz_vf
-    !$acc declare create(vel_src_rsx_vf, vel_src_rsy_vf, vel_src_rsz_vf)
+    $:DECLARE(create=["vel_src_rsx_vf","vel_src_rsy_vf","vel_src_rsz_vf"])
 
     real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsz_vf
-    !$acc declare create(mom_sp_rsx_vf, mom_sp_rsy_vf, mom_sp_rsz_vf)
+    $:DECLARE(create=["mom_sp_rsx_vf","mom_sp_rsy_vf","mom_sp_rsz_vf"])
 
     real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsz_vf
-    !$acc declare create(Re_avg_rsx_vf, Re_avg_rsy_vf, Re_avg_rsz_vf)
+    $:DECLARE(create=["Re_avg_rsx_vf","Re_avg_rsy_vf","Re_avg_rsz_vf"])
 
     !> @name Indical bounds in the s1-, s2- and s3-directions
     !> @{
@@ -105,13 +104,13 @@ module m_riemann_solvers
     type(int_bounds_info) :: isx, isy, isz
     !> @}
 
-    !$acc declare create(is1, is2, is3, isx, isy, isz)
+    $:DECLARE(create=["is1","is2","is3","isx","isy","isz"])
 
     real(wp), allocatable, dimension(:) :: Gs
-    !$acc declare create(Gs)
+    $:DECLARE(create=["Gs"])
 
     real(wp), allocatable, dimension(:, :) :: Res
-    !$acc declare create(Res)
+    $:DECLARE(create=["Res"])
 
 contains
 
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index a3be874f86..4ab6c23a7c 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -30,16 +30,16 @@ module m_surface_tension
     !> @{
     type(scalar_field), allocatable, dimension(:) :: c_divs
     !> @)
-    !$acc declare create(c_divs)
+    $:DECLARE(create=["c_divs"])
 
     !> @name cell boundary reconstructed gradient components and magnitude
     !> @{
     real(wp), allocatable, dimension(:, :, :, :) :: gL_x, gR_x, gL_y, gR_y, gL_z, gR_z
     !> @}
-    !$acc declare create(gL_x, gR_x, gL_y, gR_y, gL_z, gR_z)
+    $:DECLARE(create=["gL_x","gR_x","gL_y","gR_y","gL_z","gR_z"])
 
     type(int_bounds_info) :: is1, is2, is3, iv
-    !$acc declare create(is1, is2, is3, iv)
+    $:DECLARE(create=["is1","is2","is3","iv"])
 
 contains
 
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 7c1c986ac9..b19600c6fd 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -76,7 +76,7 @@ module m_time_steppers
     integer, private :: num_ts !<
     !! Number of time stages in the time-stepping scheme
 
-    !$acc declare create(q_cons_ts, q_prim_vf, q_T_sf, rhs_vf, q_prim_ts, rhs_mv, rhs_pb, max_dt)
+    $:DECLARE(create=["q_cons_ts","q_prim_vf","q_T_sf","rhs_vf","q_prim_ts","rhs_mv","rhs_pb","max_dt"])
 
 contains
 
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 84eb8a4329..323444c1ee 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -25,10 +25,10 @@ module m_viscous
 
     type(int_bounds_info) :: iv
     type(int_bounds_info) :: is1_viscous, is2_viscous, is3_viscous
-    !$acc declare create(is1_viscous, is2_viscous, is3_viscous, iv)
+    $:DECLARE(create=["is1_viscous","is2_viscous","is3_viscous","iv"])
 
     real(wp), allocatable, dimension(:, :) :: Res_viscous
-    !$acc declare create(Res_viscous)
+    $:DECLARE(create=["Res_viscous"])
 
 contains
 
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 830c852c83..9c9778a405 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -43,6 +43,7 @@ module m_weno
     !> @{
     real(wp), allocatable, dimension(:, :, :, :) :: v_rs_ws_x, v_rs_ws_y, v_rs_ws_z
     !> @}
+    $:DECLARE(create=["v_rs_ws_x","v_rs_ws_y","v_rs_ws_z"])
 
     ! WENO Coefficients
 
@@ -59,6 +60,8 @@ module m_weno
     real(wp), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_y
     real(wp), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_z
     !> @}
+    $:DECLARE(create=["poly_coef_cbL_x","poly_coef_cbL_y","poly_coef_cbL_z"])
+    $:DECLARE(create=["poly_coef_cbR_x","poly_coef_cbR_y","poly_coef_cbR_z"])
 
     !> @name The ideal weights at the left and the right cell-boundaries and at the
     !! left and the right quadrature points, in x-, y- and z-directions. Note
@@ -73,6 +76,7 @@ module m_weno
     real(wp), target, allocatable, dimension(:, :) :: d_cbR_y
     real(wp), target, allocatable, dimension(:, :) :: d_cbR_z
     !> @}
+    $:DECLARE(create=["d_cbL_x","d_cbL_y","d_cbL_z","d_cbR_x","d_cbR_y","d_cbR_z"])
 
     !> @name Smoothness indicator coefficients in the x-, y-, and z-directions. Note
     !! that the first array dimension identifies the smoothness indicator, the
@@ -83,27 +87,22 @@ module m_weno
     real(wp), target, allocatable, dimension(:, :, :) :: beta_coef_y
     real(wp), target, allocatable, dimension(:, :, :) :: beta_coef_z
     !> @}
+    $:DECLARE(create=["beta_coef_x","beta_coef_y","beta_coef_z"])
 
     ! END: WENO Coefficients
 
     integer :: v_size !< Number of WENO-reconstructed cell-average variables
-    !$acc declare create(v_size)
+    $:DECLARE(create=["v_size"])
 
     !> @name Indical bounds in the s1-, s2- and s3-directions
     !> @{
     type(int_bounds_info) :: is1_weno, is2_weno, is3_weno
-    !$acc declare create(is1_weno, is2_weno, is3_weno)
+    $:DECLARE(create=["is1_weno","is2_weno","is3_weno"])
     !
     !> @}
 
     real(wp) :: test
-    !$acc declare create(test)
-
-    !$acc declare create( &
-    !$acc                v_rs_ws_x, v_rs_ws_y, v_rs_ws_z, &
-    !$acc                poly_coef_cbL_x,poly_coef_cbL_y,poly_coef_cbL_z, &
-    !$acc                poly_coef_cbR_x,poly_coef_cbR_y,poly_coef_cbR_z,d_cbL_x,       &
-    !$acc                d_cbL_y,d_cbL_z,d_cbR_x,d_cbR_y,d_cbR_z,beta_coef_x,beta_coef_y,beta_coef_z)
+    $:DECLARE(create=["test"])
 
 contains
 

From bb0d2ed432753d1652a1f4094e58ae409c117263 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 11 Jun 2025 13:42:59 -0400
Subject: [PATCH 19/75] Added update device macros

---
 src/common/m_mpi_common.fpp              |  8 ++--
 src/common/m_variables_conversion.fpp    |  8 ++--
 src/simulation/m_acoustic_src.fpp        | 15 ++++---
 src/simulation/m_body_forces.fpp         |  2 +-
 src/simulation/m_boundary_conditions.fpp |  9 +++--
 src/simulation/m_bubbles_EE.fpp          |  4 +-
 src/simulation/m_bubbles_EL.fpp          | 10 ++---
 src/simulation/m_cbc.fpp                 | 24 ++++++-----
 src/simulation/m_fftw.fpp                |  6 +--
 src/simulation/m_global_parameters.fpp   | 51 ++++++++++++++----------
 src/simulation/m_hyperelastic.fpp        |  8 ++--
 src/simulation/m_hypoelastic.fpp         |  8 ++--
 src/simulation/m_ibm.fpp                 | 14 +++----
 src/simulation/m_mhd.fpp                 |  6 +--
 src/simulation/m_mpi_proxy.fpp           | 12 +++---
 src/simulation/m_qbmm.fpp                | 10 ++---
 src/simulation/m_rhs.fpp                 |  8 ++--
 src/simulation/m_riemann_solvers.fpp     | 12 +++---
 src/simulation/m_sim_helpers.fpp         |  6 +--
 src/simulation/m_start_up.fpp            | 45 ++++++++++++---------
 src/simulation/m_surface_tension.fpp     |  2 +-
 src/simulation/m_time_steppers.fpp       |  2 +-
 src/simulation/m_viscous.fpp             | 18 ++++-----
 src/simulation/m_weno.fpp                | 10 ++---
 24 files changed, 162 insertions(+), 136 deletions(-)

diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 1706b848c2..c580281ef2 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -643,7 +643,7 @@ contains
 #ifdef MFC_MPI
 
         call nvtxStartRange("RHS-COMM-PACKBUF")
-!$acc update device(v_size)
+$:UPDATE(device=["v_size"])
 
 #ifdef MFC_SIMULATION
         if (qbmm .and. .not. polytropic) then
@@ -878,7 +878,7 @@ contains
                     !$acc wait
                 #:else
                     call nvtxStartRange("RHS-COMM-HOST2DEV")
-                    !$acc update device(buff_recv)
+                    $:UPDATE(device=["buff_recv"])
                     call nvtxEndRange
                 #:endif
             end if
@@ -1093,7 +1093,7 @@ contains
 #ifdef MFC_MPI
 
         nVars = num_dims + 1
-        !$acc update device(nVars)
+        $:UPDATE(device=["nVars"])
 
         buffer_counts = (/ &
                         buff_size*nVars*(n + 1)*(p + 1), &
@@ -1208,7 +1208,7 @@ contains
                     !$acc wait
                 #:else
                     call nvtxStartRange("RHS-COMM-HOST2DEV")
-                    !$acc update device(buff_recv)
+                    $:UPDATE(device=["buff_recv"])
                     call nvtxEndRange
                 #:endif
             end if
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index d75d69528b..7c49767723 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -653,7 +653,7 @@ contains
             qvs(i) = fluid_pp(i)%qv
             qvps(i) = fluid_pp(i)%qvp
         end do
-!$acc update device(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps, Gs)
+$:UPDATE(device=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps","Gs"])
 
 #ifdef MFC_SIMULATION
 
@@ -665,7 +665,7 @@ contains
                 end do
             end do
 
-            !$acc update device(Res, Re_idx, Re_size)
+            $:UPDATE(device=["Res","Re_idx","Re_size"])
         end if
 #endif
 
@@ -679,7 +679,7 @@ contains
             do i = 1, nb
                 bubrs(i) = bub_idx%rs(i)
             end do
-            !$acc update device(bubrs)
+            $:UPDATE(device=["bubrs"])
         end if
 
 #ifdef MFC_POST_PROCESS
@@ -1476,7 +1476,7 @@ contains
         is2b = is2%beg; is2e = is2%end
         is3b = is3%beg; is3e = is3%end
 
-        !$acc update device(is1b, is2b, is3b, is1e, is2e, is3e)
+        $:UPDATE(device=["is1b","is2b","is3b","is1e","is2e","is3e"])
 
         ! Computing the flux variables from the primitive variables, without
         ! accounting for the contribution of either viscosity or capillarity
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 0a03a1b7f9..a779c0e5d5 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -111,7 +111,12 @@ contains
                 delay(i) = acoustic(i)%delay
             end if
         end do
-        !$acc update device(loc_acoustic, mag, dipole, support, length, height, wavelength, frequency, gauss_sigma_dist, gauss_sigma_time, foc_length, aperture, npulse, pulse, dir, delay, element_polygon_ratio, rotate_angle, element_spacing_angle, num_elements, element_on, bb_num_freq, bb_bandwidth, bb_lowest_freq)
+        $:UPDATE(device=["loc_acoustic","mag","dipole","support","length", &
+            & "height","wavelength","frequency","gauss_sigma_dist", &
+            & "gauss_sigma_time","foc_length","aperture","npulse","pulse", &
+            & "dir","delay","element_polygon_ratio","rotate_angle", &
+            & "element_spacing_angle","num_elements","element_on", &
+            & "bb_num_freq","bb_bandwidth","bb_lowest_freq"])
 
         @:ALLOCATE(mass_src(0:m, 0:n, 0:p))
         @:ALLOCATE(mom_src(1:num_vels, 0:m, 0:n, 0:p))
@@ -469,14 +474,14 @@ contains
                 call s_mpi_abort('Fatal Error: Inconsistent allocation of source_spatials')
             end if
 
-            !$acc update device(source_spatials(ai)%coord)
-            !$acc update device(source_spatials(ai)%val)
+            $:UPDATE(device=["source_spatials(ai)%coord"])
+            $:UPDATE(device=["source_spatials(ai)%val"])
             if (support(ai) >= 5) then
                 if (dim == 2) then
-                    !$acc update device(source_spatials(ai)%angle)
+                    $:UPDATE(device=["source_spatials(ai)%angle"])
                 end if
                 if (dim == 3) then
-                    !$acc update device(source_spatials(ai)%xyz_to_r_ratios)
+                    $:UPDATE(device=["source_spatials(ai)%xyz_to_r_ratios"])
                 end if
             end if
 
diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index 17e700fcfe..b0d7cda444 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -68,7 +68,7 @@ contains
             end if
         end if
 
-        !$acc update device(accel_bf)
+        $:UPDATE(device=["accel_bf"])
 
     end subroutine s_compute_acceleration
 
diff --git a/src/simulation/m_boundary_conditions.fpp b/src/simulation/m_boundary_conditions.fpp
index d0666e2ca7..ad50b559ac 100644
--- a/src/simulation/m_boundary_conditions.fpp
+++ b/src/simulation/m_boundary_conditions.fpp
@@ -2,6 +2,7 @@
 ! @brief Contains module m_boundary_conditions
 
 !> @brief This module contains
+#:include 'directive_macros.fpp'
 module m_boundary_conditions
 
     use m_derived_types
@@ -39,7 +40,7 @@ contains
         do dir = 1, num_dims
             do loc = -1, 1, 2
                 read (1) bc_type(dir, loc)%sf
-                !$acc update device(bc_type(dir, loc)%sf)
+                $:UPDATE(device=["bc_type(dir, loc)%sf"])
             end do
         end do
         close (1)
@@ -55,7 +56,7 @@ contains
         do dir = 1, num_dims
             do loc = -1, 1, 2
                 read (1) bc_buffers(dir, loc)%sf
-                !$acc update device(bc_buffers(dir, loc)%sf)
+                $:UPDATE(device=["bc_buffers(dir, loc)%sf"])
             end do
         end do
         close (1)
@@ -103,7 +104,7 @@ contains
                 call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), MPI_INTEGER, MPI_BC_TYPE_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr)
                 call MPI_File_read_all(file_id, bc_type(dir, loc)%sf, 1, MPI_BC_TYPE_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr)
                 offset = offset + sizeof(bc_type(dir, loc)%sf)
-                !$acc update device(bc_type(dir, loc)%sf)
+                $:UPDATE(device=["bc_type(dir, loc)%sf"])
             end do
         end do
 
@@ -113,7 +114,7 @@ contains
                 call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), mpi_p, MPI_BC_BUFFER_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr)
                 call MPI_File_read_all(file_id, bc_buffers(dir, loc)%sf, 1, MPI_BC_BUFFER_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr)
                 offset = offset + sizeof(bc_buffers(dir, loc)%sf)
-                !$acc update device(bc_buffers(dir, loc)%sf)
+                $:UPDATE(device=["bc_buffers(dir, loc)%sf"])
             end do
         end do
 
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index fd770ebcf1..714f95913c 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -52,9 +52,9 @@ contains
             end if
         end do
 
-        !$acc update device(rs, vs)
+        $:UPDATE(device=["rs", "vs"])
         if (.not. polytropic) then
-            !$acc update device(ps, ms)
+            $:UPDATE(device=["ps", "ms"])
         end if
 
         @:ALLOCATE(divu%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index af9aa96436..7f9d63c994 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -101,7 +101,7 @@ contains
             call s_mpi_abort('Please check the lag_params%solver_approach input')
         end if
 
-        !$acc update device(lag_num_ts, q_beta_idx)
+        $:UPDATE(device=["lag_num_ts", "q_beta_idx"])
 
         @:ALLOCATE(q_beta%vf(1:q_beta_idx))
 
@@ -249,7 +249,7 @@ contains
 
         print *, " Lagrange bubbles running, in proc", proc_rank, "number:", bub_id, "/", id
 
-        !$acc update device(bubbles_lagrange, lag_params)
+        $:UPDATE(device=["bubbles_lagrange", "lag_params"])
 
         !$acc update device(lag_id, bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC,   &
         !$acc bub_dphidt, gas_p, gas_mv, intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel,       &
@@ -257,9 +257,9 @@ contains
 
         Rmax_glb = min(dflt_real, -dflt_real)
         Rmin_glb = max(dflt_real, -dflt_real)
-        !$acc update device(Rmax_glb, Rmin_glb)
+        $:UPDATE(device=["Rmax_glb", "Rmin_glb"])
 
-        !$acc update device(dx, dy, dz, x_cb, x_cc, y_cb, y_cc, z_cb, z_cc)
+        $:UPDATE(device=["dx","dy","dz","x_cb","x_cc","y_cb","y_cc","z_cb","z_cc"])
 
         !Populate temporal variables
         call s_transfer_data_to_tmp()
@@ -1676,7 +1676,7 @@ contains
         end do
 
         nBubs = nBubs - 1
-        !$acc update device(nBubs)
+        $:UPDATE(device=["nBubs"])
 
     end subroutine s_remove_lag_bubble
 
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 28fe01bea8..e306717513 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -139,7 +139,7 @@ contains
         else
             flux_cbc_index = adv_idx%end
         end if
-        !$acc update device(flux_cbc_index)
+        $:UPDATE(device=["flux_cbc_index"])
 
         call s_any_cbc_boundaries(is_cbc)
 
@@ -389,7 +389,8 @@ contains
 
         end if
 
-        !$acc update device(fd_coef_x, fd_coef_y, fd_coef_z, pi_coef_x, pi_coef_y, pi_coef_z)
+        $:UPDATE(device=["fd_coef_x","fd_coef_y","fd_coef_z", &
+            & "pi_coef_x","pi_coef_y","pi_coef_z"])
 
         ! Associating the procedural pointer to the appropriate subroutine
         ! that will be utilized in the conversion to the mixture variables
@@ -397,20 +398,20 @@ contains
         bcxb = bc_x%beg
         bcxe = bc_x%end
 
-        !$acc update device(bcxb, bcxe)
+        $:UPDATE(device=["bcxb", "bcxe"])
 
         if (n > 0) then
             bcyb = bc_y%beg
             bcye = bc_y%end
 
-            !$acc update device(bcyb, bcye)
+            $:UPDATE(device=["bcyb", "bcye"])
         end if
 
         if (p > 0) then
             bczb = bc_z%beg
             bcze = bc_z%end
 
-            !$acc update device(bczb, bcze)
+            $:UPDATE(device=["bczb", "bcze"])
         end if
 
         ! Allocate GRCBC inputs
@@ -442,7 +443,8 @@ contains
                 end do
             end if
         #:endfor
-        !$acc update device(vel_in, vel_out, pres_in, pres_out, Del_in, Del_out, alpha_rho_in, alpha_in)
+        $:UPDATE(device=["vel_in","vel_out","pres_in","pres_out", &
+            & "Del_in","Del_out","alpha_rho_in","alpha_in"])
 
     end subroutine s_initialize_cbc_module
 
@@ -606,7 +608,7 @@ contains
 
         end if
 
-        !$acc update device(ds)
+        $:UPDATE(device=["ds"])
 
     end subroutine s_associate_cbc_coefficients_pointers
 
@@ -682,7 +684,7 @@ contains
         cbc_dir = cbc_dir_norm
         cbc_loc = cbc_loc_norm
 
-        !$acc update device(cbc_dir, cbc_loc)
+        $:UPDATE(device=["cbc_dir", "cbc_loc"])
 
         call s_initialize_cbc(q_prim_vf, flux_vf, flux_src_vf, &
                               ix, iy, iz)
@@ -1163,8 +1165,8 @@ contains
         end if
 
         dj = max(0, cbc_loc)
-        !$acc update device(is1, is2, is3, dj)
-        !$acc update device( dir_idx, dir_flg)
+        $:UPDATE(device=["is1","is2","is3","dj"])
+        $:UPDATE(device=["dir_idx","dir_flg"])
 
         ! Reshaping Inputted Data in x-direction
         if (cbc_dir == 1) then
@@ -1420,7 +1422,7 @@ contains
 
         ! Determining the indicial shift based on CBC location
         dj = max(0, cbc_loc)
-        !$acc update device(dj)
+        $:UPDATE(device=["dj"])
 
         ! Reshaping Outputted Data in x-direction
         if (cbc_dir == 1) then
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index c0cdc226ea..c8170d39a1 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -91,7 +91,7 @@ contains
         iembed(1) = 0
         oembed(1) = 0
         !$acc enter data copyin(real_size, cmplx_size, x_size, sys_size, batch_size, Nfq)
-        !$acc update device(real_size, cmplx_size, x_size, sys_size, batch_size)
+        $:UPDATE(device=["real_size","cmplx_size","x_size","sys_size","batch_size"])
 #else
         ! Allocate input and output DFT data sizes
         fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
@@ -172,7 +172,7 @@ contains
 #endif
         !$acc end host_data
         Nfq = 3
-        !$acc update device(Nfq)
+        $:UPDATE(device=["Nfq"])
 
         $:PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
@@ -232,7 +232,7 @@ contains
             !$acc end host_data
 
             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
-            !$acc update device(Nfq)
+            $:UPDATE(device=["Nfq"])
 
             $:PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 348f9d542c..b93f023d05 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -803,10 +803,10 @@ contains
             else
                 weno_num_stencils = weno_polyn
             end if
-            !$acc update device(weno_polyn)
-            !$acc update device(weno_num_stencils)
-            !$acc update device(nb)
-            !$acc update device(num_dims, num_vels, num_fluids)
+            $:UPDATE(device=["weno_polyn"])
+            $:UPDATE(device=["weno_num_stencils"])
+            $:UPDATE(device=["nb"])
+            $:UPDATE(device=["num_dims","num_vels","num_fluids"])
         #:endif
 
         ! Initializing the number of fluids for which viscous effects will
@@ -1042,7 +1042,7 @@ contains
             if (Re_size(1) > 0._wp) shear_stress = .true.
             if (Re_size(2) > 0._wp) bulk_stress = .true.
 
-            !$acc update device(Re_size, viscous, shear_stress, bulk_stress)
+            $:UPDATE(device=["Re_size","viscous","shear_stress","bulk_stress"])
 
             ! Bookkeeping the indexes of any viscous fluids and any pairs of
             ! fluids whose interface will support effects of surface tension
@@ -1098,7 +1098,7 @@ contains
                     ! y-dir: flip tau_xy and tau_yz
                     ! z-dir: flip tau_xz and tau_yz
                 end if
-                !$acc update device(shear_num, shear_indices, shear_BC_flip_num, shear_BC_flip_indices)
+                $:UPDATE(device=["shear_num","shear_indices","shear_BC_flip_num","shear_BC_flip_indices"])
             end if
 
             if (hyperelasticity) then
@@ -1165,7 +1165,7 @@ contains
         ! cell-boundary values or otherwise, the unaltered left and right,
         ! WENO-reconstructed, cell-boundary values
         wa_flg = 0._wp; if (weno_avg) wa_flg = 1._wp
-        !$acc update device(wa_flg)
+        $:UPDATE(device=["wa_flg"])
 
         ! Resort to default WENO-JS if no other WENO scheme is selected
         #:if not MFC_CASE_OPTIMIZATION
@@ -1175,7 +1175,7 @@ contains
         if (ib) allocate (MPI_IO_IB_DATA%var%sf(0:m, 0:n, 0:p))
         Np = 0
 
-        !$acc update device(Re_size)
+        $:UPDATE(device=["Re_size"])
 
         if (elasticity) then
             fd_number = max(1, fd_order/2)
@@ -1193,7 +1193,7 @@ contains
                                            idwint, idwbuff, viscous, &
                                            bubbles_lagrange, m, n, p, &
                                            num_dims)
-        !$acc update device(idwint, idwbuff)
+        $:UPDATE(device=["idwint", "idwbuff"])
 
         ! Configuring Coordinate Direction Indexes
         if (bubbles_euler) then
@@ -1203,7 +1203,7 @@ contains
                 & idwbuff(3)%beg:idwbuff(3)%end))
         end if
 
-        !$acc update device(fd_order,fd_number)
+        $:UPDATE(device=["fd_order", "fd_number"])
 
         if (cyl_coord .neqv. .true.) then ! Cartesian grid
             grid_geometry = 1
@@ -1230,23 +1230,32 @@ contains
         chemxb = species_idx%beg
         chemxe = species_idx%end
 
-        !$acc update device(momxb, momxe, advxb, advxe, contxb, contxe, bubxb, bubxe, intxb, intxe, sys_size, buff_size, E_idx, alf_idx, n_idx, adv_n, adap_dt, pi_fac, strxb, strxe, chemxb, chemxe, c_idx)
-        !$acc update device(b_size, xibeg, xiend, tensor_size)
+        $:UPDATE(device=["momxb","momxe","advxb","advxe","contxb","contxe", &
+            & "bubxb","bubxe","intxb","intxe","sys_size","buff_size","E_idx", &
+            & "alf_idx","n_idx","adv_n","adap_dt","pi_fac","strxb","strxe", &
+            & "chemxb","chemxe","c_idx"])
+        $:UPDATE(device=["b_size","xibeg","xiend","tensor_size"])
 
-        !$acc update device(species_idx)
-        !$acc update device(cfl_target, m, n, p)
+        $:UPDATE(device=["species_idx"])
+        $:UPDATE(device=["cfl_target","m","n","p"])
 
-        !$acc update device(alt_soundspeed, acoustic_source, num_source)
-        !$acc update device(dt, sys_size, buff_size, pref, rhoref, gamma_idx, pi_inf_idx, E_idx, alf_idx, stress_idx, mpp_lim, bubbles_euler, hypoelasticity, alt_soundspeed, avg_state, num_fluids, model_eqns, num_dims, num_vels, mixture_err, grid_geometry, cyl_coord, mp_weno, weno_eps, teno_CT, hyperelasticity, hyper_model, elasticity, xi_idx, B_idx, low_Mach)
+        $:UPDATE(device=["alt_soundspeed","acoustic_source","num_source"])
+        $:UPDATE(device=["dt","sys_size","buff_size","pref","rhoref", &
+            & "gamma_idx","pi_inf_idx","E_idx","alf_idx","stress_idx", &
+            & "mpp_lim","bubbles_euler","hypoelasticity","alt_soundspeed", &
+            & "avg_state","num_fluids","model_eqns","num_dims","num_vels", &
+            & "mixture_err","grid_geometry","cyl_coord","mp_weno","weno_eps", &
+            & "teno_CT","hyperelasticity","hyper_model","elasticity","xi_idx", &
+            & "B_idx","low_Mach"])
 
-        !$acc update device(Bx0, powell)
+        $:UPDATE(device=["Bx0", "powell"])
 
-        !$acc update device(cont_damage, tau_star, cont_damage_s, alpha_bar)
+        $:UPDATE(device=["cont_damage","tau_star","cont_damage_s","alpha_bar"])
 
         #:if not MFC_CASE_OPTIMIZATION
-            !$acc update device(wenojs, mapped_weno, wenoz, teno)
-            !$acc update device(wenoz_q)
-            !$acc update device(mhd, relativity)
+            $:UPDATE(device=["wenojs","mapped_weno","wenoz","teno"])
+            $:UPDATE(device=["wenoz_q"])
+            $:UPDATE(device=["mhd", "relativity"])
         #:endif
 
         !$acc enter data copyin(nb, R0ref, Ca, Web, Re_inv, weight, R0, V0, bubbles_euler, polytropic, polydisperse, qbmm, R0_type, ptil, bubble_model, thermal, poly_sigma)
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 11091627e8..efe984ef05 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -60,7 +60,7 @@ contains
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
-        !$acc update device(Gs)
+        $:UPDATE(device=["Gs"])
 
         @:ALLOCATE(fd_coeff_x(-fd_number:fd_number, 0:m))
         if (n > 0) then
@@ -73,16 +73,16 @@ contains
         ! Computing centered finite difference coefficients
         call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x, buff_size, &
                                                       fd_number, fd_order)
-        !$acc update device(fd_coeff_x)
+        $:UPDATE(device=["fd_coeff_x"])
         if (n > 0) then
             call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y, buff_size, &
                                                           fd_number, fd_order)
-            !$acc update device(fd_coeff_y)
+            $:UPDATE(device=["fd_coeff_y"])
         end if
         if (p > 0) then
             call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z, buff_size, &
                                                           fd_number, fd_order)
-            !$acc update device(fd_coeff_z)
+            $:UPDATE(device=["fd_coeff_z"])
         end if
 
     end subroutine s_initialize_hyperelastic_module
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index 430b69d5e5..e954eebbc8 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -56,7 +56,7 @@ contains
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
-        !$acc update device(Gs)
+        $:UPDATE(device=["Gs"])
 
         @:ALLOCATE(fd_coeff_x_h(-fd_number:fd_number, 0:m))
         if (n > 0) then
@@ -69,16 +69,16 @@ contains
         ! Computing centered finite difference coefficients
         call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_h, buff_size, &
                                                       fd_number, fd_order)
-        !$acc update device(fd_coeff_x_h)
+        $:UPDATE(device=["fd_coeff_x_h"])
         if (n > 0) then
             call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_h, buff_size, &
                                                           fd_number, fd_order)
-            !$acc update device(fd_coeff_y_h)
+            $:UPDATE(device=["fd_coeff_y_h"])
         end if
         if (p > 0) then
             call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_h, buff_size, &
                                                           fd_number, fd_order)
-            !$acc update device(fd_coeff_z_h)
+            $:UPDATE(device=["fd_coeff_z_h"])
         end if
 
     end subroutine s_initialize_hypoelastic_module
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index a3538b2a7e..cd09f74d1f 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -81,9 +81,9 @@ contains
 
         integer :: i, j, k
 
-        !$acc update device(ib_markers%sf)
-        !$acc update device(levelset%sf)
-        !$acc update device(levelset_norm%sf)
+        $:UPDATE(device=["ib_markers%sf"])
+        $:UPDATE(device=["levelset%sf"])
+        $:UPDATE(device=["levelset_norm%sf"])
 
         ! Get neighboring IB variables from other processors
         call s_mpi_sendrecv_ib_buffers(ib_markers, gp_layers)
@@ -92,20 +92,20 @@ contains
 
         call s_find_num_ghost_points(num_gps, num_inner_gps)
 
-        !$acc update device(num_gps, num_inner_gps)
+        $:UPDATE(device=["num_gps", "num_inner_gps"])
         @:ALLOCATE(ghost_points(1:num_gps))
         @:ALLOCATE(inner_points(1:num_inner_gps))
 
         !$acc enter data copyin(ghost_points, inner_points)
 
         call s_find_ghost_points(ghost_points, inner_points)
-        !$acc update device(ghost_points, inner_points)
+        $:UPDATE(device=["ghost_points", "inner_points"])
 
         call s_compute_image_points(ghost_points, levelset, levelset_norm)
-        !$acc update device(ghost_points)
+        $:UPDATE(device=["ghost_points"])
 
         call s_compute_interpolation_coeffs(ghost_points)
-        !$acc update device(ghost_points)
+        $:UPDATE(device=["ghost_points"])
 
     end subroutine s_ibm_setup
 
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index a32412398c..6a6821e602 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -53,12 +53,12 @@ contains
 
         ! Computing centered finite difference coefficients
         call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_h, buff_size, fd_number, fd_order)
-        !$acc update device(fd_coeff_x_h)
+        $:UPDATE(device=["fd_coeff_x_h"])
         call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_h, buff_size, fd_number, fd_order)
-        !$acc update device(fd_coeff_y_h)
+        $:UPDATE(device=["fd_coeff_y_h"])
         if (p > 0) then
             call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_h, buff_size, fd_number, fd_order)
-            !$acc update device(fd_coeff_z_h)
+            $:UPDATE(device=["fd_coeff_z_h"])
         end if
 
     end subroutine s_initialize_mhd_powell_module
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index f7c05cd07f..1eeec98361 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -909,7 +909,7 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                !$acc update device(ib_buff_recv)
+                $:UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
@@ -1038,7 +1038,7 @@ contains
             end if
 
             if (rdma_mpi .eqv. .false.) then
-                !$acc update device(ib_buff_recv)
+                $:UPDATE(device=["ib_buff_recv"])
             end if
 
             ! Unpacking buffer received from bc_x%end
@@ -1175,7 +1175,7 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                !$acc update device(ib_buff_recv)
+                $:UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
@@ -1311,7 +1311,7 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                !$acc update device(ib_buff_recv)
+                $:UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
@@ -1450,7 +1450,7 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                !$acc update device(ib_buff_recv)
+                $:UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
@@ -1586,7 +1586,7 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                !$acc update device(ib_buff_recv)
+                $:UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 863324b71d..912ab99e13 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -59,7 +59,7 @@ contains
             end if
 
             !$acc enter data copyin(nterms)
-            !$acc update device(nterms)
+            $:UPDATE(device=["nterms"])
 
         #:endif
 
@@ -393,7 +393,7 @@ contains
             end do
         end if
 
-        !$acc update device(momrhs)
+        $:UPDATE(device=["momrhs"])
 
         @:ALLOCATE(bubrs(1:nb))
         @:ALLOCATE(bubmoms(1:nb, 1:nmom))
@@ -401,14 +401,14 @@ contains
         do i = 1, nb
             bubrs(i) = bub_idx%rs(i)
         end do
-        !$acc update device(bubrs)
+        $:UPDATE(device=["bubrs"])
 
         do j = 1, nmom
             do i = 1, nb
                 bubmoms(i, j) = bub_idx%moms(i, j)
             end do
         end do
-        !$acc update device(bubmoms)
+        $:UPDATE(device=["bubmoms"])
 
     end subroutine s_initialize_qbmm_module
 
@@ -838,7 +838,7 @@ contains
 
         is1_qbmm = ix; is2_qbmm = iy; is3_qbmm = iz
 
-        !$acc update device(is1_qbmm, is2_qbmm, is3_qbmm)
+        $:UPDATE(device=["is1_qbmm","is2_qbmm","is3_qbmm"])
 
         $:PARALLEL_LOOP(collapse=3, private=["moms", "msum", "wght", "abscX", &
             "abscY", "wght_pb", "wght_mv", "wght_ht", "coeff", "ht", "r", "q", &
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 54899dc0fa..cbc50298ba 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -180,7 +180,7 @@ contains
         integer :: num_eqns_after_adv
 
         !$acc enter data copyin(idwbuff, idwbuff)
-        !$acc update device(idwbuff, idwbuff)
+        $:UPDATE(device=["idwbuff", "idwbuff"])
 
         @:ALLOCATE(q_cons_qp%vf(1:sys_size))
         @:ALLOCATE(q_prim_qp%vf(1:sys_size))
@@ -578,7 +578,7 @@ contains
             gamma_min(i) = 1._wp/fluid_pp(i)%gamma + 1._wp
             pres_inf(i) = fluid_pp(i)%pi_inf/(1._wp + fluid_pp(i)%gamma)
         end do
-        !$acc update device(gamma_min, pres_inf)
+        $:UPDATE(device=["gamma_min", "pres_inf"])
 
         if (viscous) then
             @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
@@ -590,7 +590,7 @@ contains
                     Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
                 end do
             end do
-            !$acc update device(Res, Re_idx, Re_size)
+            $:UPDATE(device=["Res","Re_idx","Re_size"])
         end if
 
         $:PARALLEL_LOOP(collapse=4)
@@ -2005,7 +2005,7 @@ contains
 
         end if
 
-        !$acc update device(is1, is2, is3, iv)
+        $:UPDATE(device=["is1","is2","is3","iv"])
 
         if (recon_dir == 1) then
             !$acc parallel loop collapse(4) default(present)
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index a593a36c71..1f52925f00 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -3335,7 +3335,7 @@ contains
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
-        !$acc update device(Gs)
+        $:UPDATE(device=["Gs"])
 
         if (viscous) then
             @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
@@ -3347,7 +3347,7 @@ contains
                     Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
                 end do
             end do
-            !$acc update device(Res, Re_idx, Re_size)
+            $:UPDATE(device=["Res","Re_idx","Re_size"])
         end if
 
         !$acc enter data copyin(is1, is2, is3, isx, isy, isz)
@@ -3496,7 +3496,7 @@ contains
             dir_idx = (/3, 1, 2/); dir_flg = (/0._wp, 0._wp, 1._wp/)
         end if
 
-        !$acc update device(is1, is2, is3)
+        $:UPDATE(device=["is1","is2","is3"])
 
         if (elasticity) then
             if (norm_dir == 1) then
@@ -3509,8 +3509,10 @@ contains
         end if
 
         isx = ix; isy = iy; isz = iz
-        !$acc update device(isx, isy, isz) ! for stuff in the same module
-        !$acc update device(dir_idx, dir_flg,  dir_idx_tau) ! for stuff in different modules
+        ! for stuff in the same module
+        $:UPDATE(device=["isx","isy","isz"])
+        ! for stuff in different modules
+        $:UPDATE(device=["dir_idx","dir_flg","dir_idx_tau"])
 
         ! Population of Buffers in x-direction
         if (norm_dir == 1) then
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 49f1a5e7d7..4f51c59898 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -275,17 +275,17 @@ contains
 
         bc_type(1, -1)%sf(:, :, :) = bc_x%beg
         bc_type(1, 1)%sf(:, :, :) = bc_x%end
-        !$acc update device(bc_type(1,-1)%sf, bc_type(1,1)%sf)
+        $:UPDATE(device=["bc_type(1,-1)%sf","bc_type(1,1)%sf"])
 
         if (n > 0) then
             bc_type(2, -1)%sf(:, :, :) = bc_y%beg
             bc_type(2, 1)%sf(:, :, :) = bc_y%end
-            !$acc update device(bc_type(2,-1)%sf, bc_type(2,1)%sf)
+            $:UPDATE(device=["bc_type(2,-1)%sf","bc_type(2,1)%sf"])
 
             if (p > 0) then
                 bc_type(3, -1)%sf(:, :, :) = bc_z%beg
                 bc_type(3, 1)%sf(:, :, :) = bc_z%end
-                !$acc update device(bc_type(3,-1)%sf, bc_type(3,1)%sf)
+                $:UPDATE(device=["bc_type(3,-1)%sf","bc_type(3,1)%sf"])
             end if
         end if
 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index bf1d429714..859e30afe9 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -3,6 +3,7 @@
 !! @brief Contains module m_start_up
 
 #:include 'case.fpp'
+#:include 'directive_macros.fpp'
 
 !> @brief The purpose of the module is primarily to read in the files that
 !!              contain the inputs, the initial condition data and the grid data
@@ -1268,12 +1269,12 @@ contains
         if (cfl_dt) then
             if ((mytime + dt) >= t_stop) then
                 dt = t_stop - mytime
-                !$acc update device(dt)
+                $:UPDATE(device=["dt"])
             end if
         else
             if ((mytime + dt) >= finaltime) then
                 dt = finaltime - mytime
-                !$acc update device(dt)
+                $:UPDATE(device=["dt"])
             end if
         end if
 
@@ -1634,38 +1635,44 @@ contains
         integer :: i
         !Update GPU DATA
         do i = 1, sys_size
-            !$acc update device(q_cons_ts(1)%vf(i)%sf)
+            $:UPDATE(device=["q_cons_ts(1)%vf(i)%sf"])
         end do
 
         if (qbmm .and. .not. polytropic) then
-            !$acc update device(pb_ts(1)%sf, mv_ts(1)%sf)
+            $:UPDATE(device=["pb_ts(1)%sf","mv_ts(1)%sf"])
         end if
         if (chemistry) then
-            !$acc update device(q_T_sf%sf)
+            $:UPDATE(device=["q_T_sf%sf"])
         end if
-        !$acc update device(nb, R0ref, Ca, Web, Re_inv, weight, R0, V0, bubbles_euler, polytropic, polydisperse, qbmm, R0_type, ptil, bubble_model, thermal, poly_sigma, adv_n, adap_dt, adap_dt_tol, n_idx, pi_fac, low_Mach)
-        !$acc update device(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN , mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam)
+        $:UPDATE(device=["nb","R0ref","Ca","Web","Re_inv","weight","R0","V0", &
+            & "bubbles_euler","polytropic","polydisperse","qbmm","R0_type", &
+            & "ptil","bubble_model","thermal","poly_sigma","adv_n","adap_dt", &
+            & "adap_dt_tol","n_idx","pi_fac","low_Mach"])
+        $:UPDATE(device=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw","pv","M_n", &
+            & "M_v","k_n","k_v","pb0","mass_n0","mass_v0","Pe_T","Re_trans_T", &
+            & "Re_trans_c","Im_trans_T","Im_trans_c","omegaN","mul0","ss", &
+            & "gamma_v","mu_v","gamma_m","gamma_n","mu_n","gam"])
 
-        !$acc update device(acoustic_source, num_source)
-        !$acc update device(sigma, surface_tension)
+        $:UPDATE(device=["acoustic_source", "num_source"])
+        $:UPDATE(device=["sigma", "surface_tension"])
 
-        !$acc update device(dx, dy, dz, x_cb, x_cc, y_cb, y_cc, z_cb, z_cc)
+        $:UPDATE(device=["dx","dy","dz","x_cb","x_cc","y_cb","y_cc","z_cb","z_cc"])
 
-        !$acc update device(bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3)
-        !$acc update device(bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3)
-        !$acc update device(bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3)
+        $:UPDATE(device=["bc_x%vb1","bc_x%vb2","bc_x%vb3","bc_x%ve1","bc_x%ve2","bc_x%ve3"])
+        $:UPDATE(device=["bc_y%vb1","bc_y%vb2","bc_y%vb3","bc_y%ve1","bc_y%ve2","bc_y%ve3"])
+        $:UPDATE(device=["bc_z%vb1","bc_z%vb2","bc_z%vb3","bc_z%ve1","bc_z%ve2","bc_z%ve3"])
 
-        !$acc update device(bc_x%grcbc_in, bc_x%grcbc_out, bc_x%grcbc_vel_out)
-        !$acc update device(bc_y%grcbc_in, bc_y%grcbc_out, bc_y%grcbc_vel_out)
-        !$acc update device(bc_z%grcbc_in, bc_z%grcbc_out, bc_z%grcbc_vel_out)
+        $:UPDATE(device=["bc_x%grcbc_in","bc_x%grcbc_out","bc_x%grcbc_vel_out"])
+        $:UPDATE(device=["bc_y%grcbc_in","bc_y%grcbc_out","bc_y%grcbc_vel_out"])
+        $:UPDATE(device=["bc_z%grcbc_in","bc_z%grcbc_out","bc_z%grcbc_vel_out"])
 
-        !$acc update device(relax, relax_model)
+        $:UPDATE(device=["relax", "relax_model"])
         if (relax) then
-            !$acc update device(palpha_eps, ptgalpha_eps)
+            $:UPDATE(device=["palpha_eps", "ptgalpha_eps"])
         end if
 
         if (ib) then
-            !$acc update device(ib_markers%sf)
+            $:UPDATE(device=["ib_markers%sf"])
         end if
     end subroutine s_initialize_gpu_vars
 
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 4ab6c23a7c..3f11f01443 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -337,7 +337,7 @@ contains
 
         end if
 
-        !$acc update device(is1, is2, is3, iv)
+        $:UPDATE(device=["is1","is2","is3","iv"])
 
         if (recon_dir == 1) then
             !$acc parallel loop collapse(4) default(present)
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index b19600c6fd..0196ce2f25 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -1006,7 +1006,7 @@ contains
             call s_mpi_allreduce_min(dt_local, dt)
         end if
 
-        !$acc update device(dt)
+        $:UPDATE(device=["dt"])
 
     end subroutine s_compute_dt
 
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 323444c1ee..e37f75d456 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -43,7 +43,7 @@ contains
                 Res_viscous(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
             end do
         end do
-        !$acc update device(Res_viscous, Re_idx, Re_size)
+        $:UPDATE(device=["Res_viscous","Re_idx","Re_size"])
         !$acc enter data copyin(is1_viscous, is2_viscous, is3_viscous, iv)
 
     end subroutine s_initialize_viscous_module
@@ -76,7 +76,7 @@ contains
 
         is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous)
+        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
 
         $:PARALLEL_LOOP(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
@@ -550,7 +550,7 @@ contains
 
             iv%beg = mom_idx%beg; iv%end = mom_idx%end
 
-            !$acc update device(iv)
+            $:UPDATE(device=["iv"])
 
             call s_reconstruct_cell_boundary_values_visc( &
                 q_prim_qp%vf(iv%beg:iv%end), &
@@ -588,11 +588,11 @@ contains
         else ! Compute velocity gradient at cell centers using finite differences
 
             iv%beg = mom_idx%beg; iv%end = mom_idx%end
-            !$acc update device(iv)
+            $:UPDATE(device=["iv"])
 
             is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
 
-            !$acc update device(is1_viscous, is2_viscous, is3_viscous)
+            $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
 
             $:PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
@@ -1000,7 +1000,7 @@ contains
 
         end if
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
 
         if (n > 0) then
             if (p > 0) then
@@ -1097,7 +1097,7 @@ contains
 
         end if
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
 
         if (n > 0) then
             if (p > 0) then
@@ -1200,7 +1200,7 @@ contains
         is3_viscous = iz
         iv = iv_in
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
 
         ! First-Order Spatial Derivatives in x-direction
         if (norm_dir == 1) then
@@ -1320,7 +1320,7 @@ contains
 
         is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous)
+        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
 
         $:PARALLEL_LOOP(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 9c9778a405..0ca1775217 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -625,11 +625,11 @@ contains
         #:endfor
 
         if (weno_dir == 1) then
-            !$acc update device(poly_coef_cbL_x, poly_coef_cbR_x, d_cbL_x, d_cbR_x, beta_coef_x)
+            $:UPDATE(device=["poly_coef_cbL_x","poly_coef_cbR_x","d_cbL_x","d_cbR_x","beta_coef_x"])
         elseif (weno_dir == 2) then
-            !$acc update device(poly_coef_cbL_y, poly_coef_cbR_y, d_cbL_y, d_cbR_y, beta_coef_y)
+            $:UPDATE(device=["poly_coef_cbL_y","poly_coef_cbR_y","d_cbL_y","d_cbR_y","beta_coef_y"])
         else
-            !$acc update device(poly_coef_cbL_z, poly_coef_cbR_z, d_cbL_z, d_cbR_z, beta_coef_z)
+            $:UPDATE(device=["poly_coef_cbL_z","poly_coef_cbR_z","d_cbL_z","d_cbR_z","beta_coef_z"])
         end if
 
         ! Nullifying WENO coefficients and cell-boundary locations pointers
@@ -664,7 +664,7 @@ contains
         is2_weno = is2_weno_d
         is3_weno = is3_weno_d
 
-        !$acc update device(is1_weno, is2_weno, is3_weno)
+        $:UPDATE(device=["is1_weno","is2_weno","is3_weno"])
 
         if (weno_order /= 1) then
             call s_initialize_weno(v_vf, &
@@ -1135,7 +1135,7 @@ contains
         ! as to reshape the inputted data in the coordinate direction of
         ! the WENO reconstruction
         v_size = ubound(v_vf, 1)
-        !$acc update device(v_size)
+        $:UPDATE(device=["v_size"])
 
         if (weno_dir == 1) then
             $:PARALLEL_LOOP(collapse=4)

From 9814a90ffa6e56ca2eef1925403ab2f24a626afe Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 11 Jun 2025 13:58:36 -0400
Subject: [PATCH 20/75] Added update host macros

---
 src/common/m_mpi_common.fpp        |  4 ++--
 src/simulation/m_bubbles_EL.fpp    | 16 +++++++++-------
 src/simulation/m_data_output.fpp   |  6 +++---
 src/simulation/m_ibm.fpp           |  2 +-
 src/simulation/m_mpi_proxy.fpp     | 24 ++++++++++++------------
 src/simulation/m_start_up.fpp      | 14 +++++++-------
 src/simulation/m_time_steppers.fpp |  4 ++--
 7 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index c580281ef2..10398001fa 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -860,7 +860,7 @@ $:UPDATE(device=["v_size"])
                     call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
                 #:else
                     call nvtxStartRange("RHS-COMM-DEV2HOST")
-                    !$acc update host(buff_send)
+                    $:UPDATE(host=["buff_send"])
                     call nvtxEndRange
                     call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
                 #:endif
@@ -1190,7 +1190,7 @@ $:UPDATE(device=["v_size"])
                     call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
                 #:else
                     call nvtxStartRange("RHS-COMM-DEV2HOST")
-                    !$acc update host(buff_send)
+                    $:UPDATE(host=["buff_send"])
                     call nvtxEndRange
                     call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
                 #:endif
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 7f9d63c994..14ecf35355 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -251,9 +251,11 @@ contains
 
         $:UPDATE(device=["bubbles_lagrange", "lag_params"])
 
-        !$acc update device(lag_id, bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC,   &
-        !$acc bub_dphidt, gas_p, gas_mv, intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel,       &
-        !$acc mtn_s, intfc_draddt, intfc_dveldt, gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt, nBubs)
+        $:UPDATE(device=["lag_id","bub_R0","Rmax_stats","Rmin_stats","gas_mg", &
+            & "gas_betaT","gas_betaC","bub_dphidt","gas_p","gas_mv", &
+            & "intfc_rad","intfc_vel","mtn_pos","mtn_posPrev","mtn_vel", &
+            & "mtn_s","intfc_draddt","intfc_dveldt","gas_dpdt","gas_dmvdt", &
+            & "mtn_dposdt","mtn_dveldt","nBubs"])
 
         Rmax_glb = min(dflt_real, -dflt_real)
         Rmin_glb = max(dflt_real, -dflt_real)
@@ -1044,7 +1046,7 @@ contains
             if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
 
             if (lag_params%write_bubbles) then
-                !$acc update host(gas_p, gas_mv, intfc_rad, intfc_vel)
+                $:UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
                 call s_write_lag_particles(mytime)
             end if
 
@@ -1078,7 +1080,7 @@ contains
                 if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
 
                 if (lag_params%write_bubbles) then
-                    !$acc update host(gas_p, gas_mv, intfc_rad, intfc_vel)
+                    $:UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
                     call s_write_lag_particles(mytime)
                 end if
 
@@ -1125,7 +1127,7 @@ contains
                 if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
 
                 if (lag_params%write_bubbles) then
-                    !$acc update host(gas_p, gas_mv, intfc_rad, intfc_vel)
+                    $:UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
                     call s_write_lag_particles(mytime)
                 end if
 
@@ -1623,7 +1625,7 @@ contains
         write (file_loc, '(A,I0,A)') 'stats_lag_bubbles_', proc_rank, '.dat'
         file_loc = trim(case_dir)//'/D/'//trim(file_loc)
 
-        !$acc update host(Rmax_glb, Rmin_glb)
+        $:UPDATE(host=["Rmax_glb","Rmin_glb"])
 
         open (13, FILE=trim(file_loc), FORM='formatted', position='rewind')
         write (13, *) 'proc_rank, particleID, x, y, z, Rmax_glb, Rmin_glb'
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 7ff2756709..e8f4c92f06 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -303,10 +303,10 @@ contains
         ! Determining local stability criteria extrema at current time-step
 
 #ifdef _CRAYFTN
-        !$acc update host(icfl_sf)
+        $:UPDATE(host=["icfl_sf"])
 
         if (viscous) then
-            !$acc update host(vcfl_sf, Rc_sf)
+            $:UPDATE(host=["vcfl_sf","Rc_sf"])
         end if
 
         icfl_max_loc = maxval(icfl_sf)
@@ -529,7 +529,7 @@ contains
         if (prim_vars_wrt .or. (n == 0 .and. p == 0)) then
             call s_convert_conservative_to_primitive_variables(q_cons_vf, q_T_sf, q_prim_vf, idwint)
             do i = 1, sys_size
-                !$acc update host(q_prim_vf(i)%sf(:,:,:))
+                $:UPDATE(host=["q_prim_vf(i)%sf(:,:,:)"])
             end do
             ! q_prim_vf(bubxb) stores the value of nb needed in riemann solvers, so replace with true primitive value (=1._wp)
             if (qbmm) then
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index cd09f74d1f..57ce03c183 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -88,7 +88,7 @@ contains
         ! Get neighboring IB variables from other processors
         call s_mpi_sendrecv_ib_buffers(ib_markers, gp_layers)
 
-        !$acc update host(ib_markers%sf)
+        $:UPDATE(host=["ib_markers%sf"])
 
         call s_find_num_ghost_points(num_gps, num_inner_gps)
 
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 1eeec98361..37882ec1ab 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -835,7 +835,7 @@ contains
                 else
 #endif
 
-                    !$acc update host(ib_buff_send, ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send","ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -889,7 +889,7 @@ contains
                     !$acc wait
                 else
 #endif
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -967,7 +967,7 @@ contains
                 else
 #endif
 
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
                     call MPI_SENDRECV( &
                         ib_buff_send(0), &
                         gp_layers*(n + 1)*(p + 1), &
@@ -1020,7 +1020,7 @@ contains
                 else
 #endif
 
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     call MPI_SENDRECV( &
                         ib_buff_send(0), &
@@ -1099,7 +1099,7 @@ contains
                 else
 #endif
 
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1155,7 +1155,7 @@ contains
                 else
 #endif
 
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1235,7 +1235,7 @@ contains
                 else
 #endif
 
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1291,7 +1291,7 @@ contains
                 else
 #endif
 
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1374,7 +1374,7 @@ contains
                 else
 #endif
 
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1430,7 +1430,7 @@ contains
                 else
 #endif
 
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1510,7 +1510,7 @@ contains
                     !$acc wait
                 else
 #endif
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1566,7 +1566,7 @@ contains
                     !$acc wait
                 else
 #endif
-                    !$acc update host(ib_buff_send)
+                    $:UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 859e30afe9..208f0524e6 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1298,7 +1298,7 @@ contains
 
         if (probe_wrt) then
             do i = 1, sys_size
-                !$acc update host(q_cons_ts(1)%vf(i)%sf)
+                $:UPDATE(host=["q_cons_ts(1)%vf(i)%sf"])
             end do
         end if
 
@@ -1404,7 +1404,7 @@ contains
         call cpu_time(start)
         call nvtxStartRange("SAVE-DATA")
         do i = 1, sys_size
-            !$acc update host(q_cons_ts(1)%vf(i)%sf)
+            $:UPDATE(host=["q_cons_ts(1)%vf(i)%sf"])
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -1418,8 +1418,8 @@ contains
         end do
 
         if (qbmm .and. .not. polytropic) then
-            !$acc update host(pb_ts(1)%sf)
-            !$acc update host(mv_ts(1)%sf)
+            $:UPDATE(host=["pb_ts(1)%sf"])
+            $:UPDATE(host=["mv_ts(1)%sf"])
         end if
 
         if (cfl_dt) then
@@ -1429,16 +1429,16 @@ contains
         end if
 
         if (bubbles_lagrange) then
-            !$acc update host(intfc_rad)
+            $:UPDATE(host=["intfc_rad"])
             do i = 1, nBubs
                 if (ieee_is_nan(intfc_rad(i, 1)) .or. intfc_rad(i, 1) <= 0._wp) then
                     call s_mpi_abort("Bubble radius is negative or NaN, please reduce dt.")
                 end if
             end do
 
-            !$acc update host(q_beta%vf(1)%sf)
+            $:UPDATE(host=["q_beta%vf(1)%sf"])
             call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, q_beta%vf(1))
-            !$acc update host(Rmax_stats, Rmin_stats, gas_p, gas_mv, intfc_vel)
+            $:UPDATE(host=["Rmax_stats","Rmin_stats","gas_p","gas_mv","intfc_vel"])
             call s_write_restart_lag_bubbles(save_count) !parallel
             if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats()
         else
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 0196ce2f25..15fd95e52c 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -948,7 +948,7 @@ contains
             if (stage == 3) then
                 if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
                 if (lag_params%write_bubbles) then
-                    !$acc update host(gas_p, gas_mv, intfc_rad, intfc_vel)
+                    $:UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
                     call s_write_lag_particles(mytime)
                 end if
                 call s_write_void_evol(mytime)
@@ -1051,7 +1051,7 @@ contains
         integer :: i !< Generic loop iterator
 
         do i = 1, sys_size
-            !$acc update host(q_prim_vf(i)%sf)
+            $:UPDATE(host=["q_prim_vf(i)%sf"])
         end do
 
         if (t_step == t_step_start) then

From 2bcfa220b74e9915b5f7cb60c9e6cc632774d853 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 11 Jun 2025 16:50:43 -0400
Subject: [PATCH 21/75] Replaced rest of parallel loop directives

---
 src/common/m_mpi_common.fpp             | 48 ++++++++++-----------
 src/simulation/m_acoustic_src.fpp       |  2 +-
 src/simulation/m_bubbles_EL.fpp         | 25 +++++------
 src/simulation/m_bubbles_EL_kernels.fpp |  4 +-
 src/simulation/m_hypoelastic.fpp        |  2 +-
 src/simulation/m_ibm.fpp                | 10 ++++-
 src/simulation/m_mpi_proxy.fpp          | 36 ++++++++--------
 src/simulation/m_qbmm.fpp               |  8 ++--
 src/simulation/m_rhs.fpp                | 56 ++++++++++++-------------
 src/simulation/m_surface_tension.fpp    |  6 +--
 src/simulation/m_viscous.fpp            | 10 ++---
 src/simulation/m_weno.fpp               | 14 +++----
 12 files changed, 114 insertions(+), 107 deletions(-)

diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 10398001fa..19bd461d5c 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -697,7 +697,7 @@ $:UPDATE(device=["v_size"])
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do l = 0, p
                         do k = 0, n
                             do j = 0, buff_size - 1
@@ -711,7 +711,7 @@ $:UPDATE(device=["v_size"])
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=4,private=["r"])
                         do l = 0, p
                             do k = 0, n
                                 do j = 0, buff_size - 1
@@ -726,7 +726,7 @@ $:UPDATE(device=["v_size"])
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do l = 0, p
                             do k = 0, n
                                 do j = 0, buff_size - 1
@@ -743,7 +743,7 @@ $:UPDATE(device=["v_size"])
                     end if
 #endif
                 #:elif mpi_dir == 2
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, sys_size
                         do l = 0, p
                             do k = 0, buff_size - 1
@@ -759,7 +759,7 @@ $:UPDATE(device=["v_size"])
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = 0, buff_size - 1
@@ -775,7 +775,7 @@ $:UPDATE(device=["v_size"])
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = 0, buff_size - 1
@@ -793,7 +793,7 @@ $:UPDATE(device=["v_size"])
                     end if
 #endif
                 #:else
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, sys_size
                         do l = 0, buff_size - 1
                             do k = -buff_size, n + buff_size
@@ -809,7 +809,7 @@ $:UPDATE(device=["v_size"])
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, buff_size - 1
                                 do k = -buff_size, n + buff_size
@@ -825,7 +825,7 @@ $:UPDATE(device=["v_size"])
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, buff_size - 1
                                 do k = -buff_size, n + buff_size
@@ -895,7 +895,7 @@ $:UPDATE(device=["v_size"])
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do l = 0, p
                         do k = 0, n
                             do j = -buff_size, -1
@@ -916,7 +916,7 @@ $:UPDATE(device=["v_size"])
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do l = 0, p
                             do k = 0, n
                                 do j = -buff_size, -1
@@ -931,7 +931,7 @@ $:UPDATE(device=["v_size"])
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do l = 0, p
                             do k = 0, n
                                 do j = -buff_size, -1
@@ -948,7 +948,7 @@ $:UPDATE(device=["v_size"])
                     end if
 #endif
                 #:elif mpi_dir == 2
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, sys_size
                         do l = 0, p
                             do k = -buff_size, -1
@@ -970,7 +970,7 @@ $:UPDATE(device=["v_size"])
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = -buff_size, -1
@@ -986,7 +986,7 @@ $:UPDATE(device=["v_size"])
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = -buff_size, -1
@@ -1005,7 +1005,7 @@ $:UPDATE(device=["v_size"])
 #endif
                 #:else
                     ! Unpacking buffer from bc_z%beg
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, sys_size
                         do l = -buff_size, -1
                             do k = -buff_size, n + buff_size
@@ -1028,7 +1028,7 @@ $:UPDATE(device=["v_size"])
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = -buff_size, -1
                                 do k = -buff_size, n + buff_size
@@ -1045,7 +1045,7 @@ $:UPDATE(device=["v_size"])
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = -buff_size, -1
                                 do k = -buff_size, n + buff_size
@@ -1135,7 +1135,7 @@ $:UPDATE(device=["v_size"])
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do l = 0, p
                         do k = 0, n
                             do j = 0, buff_size - 1
@@ -1148,7 +1148,7 @@ $:UPDATE(device=["v_size"])
                     end do
 
                 #:elif mpi_dir == 2
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, nVars
                         do l = 0, p
                             do k = 0, buff_size - 1
@@ -1163,7 +1163,7 @@ $:UPDATE(device=["v_size"])
                     end do
 
                 #:else
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, nVars
                         do l = 0, buff_size - 1
                             do k = -buff_size, n + buff_size
@@ -1224,7 +1224,7 @@ $:UPDATE(device=["v_size"])
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do l = 0, p
                         do k = 0, n
                             do j = -buff_size, -1
@@ -1244,7 +1244,7 @@ $:UPDATE(device=["v_size"])
                     end do
 
                 #:elif mpi_dir == 2
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, nVars
                         do l = 0, p
                             do k = -buff_size, -1
@@ -1266,7 +1266,7 @@ $:UPDATE(device=["v_size"])
 
                 #:else
                     ! Unpacking buffer from bc_z%beg
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, nVars
                         do l = -buff_size, -1
                             do k = -buff_size, n + buff_size
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index a779c0e5d5..d7186d75a3 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -220,7 +220,7 @@ contains
 
             deallocate (phi_rn)
 
-            !$acc parallel loop gang vector default(present) private(myalpha, myalpha_rho)
+            $:PARALLEL_LOOP(private=["myalpha","myalpha_rho"])
             do i = 1, num_points
                 j = source_spatials(ai)%coord(1, i)
                 k = source_spatials(ai)%coord(2, i)
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 14ecf35355..a1746ddac6 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -530,7 +530,7 @@ contains
         ! Subgrid p_inf model based on Maeda and Colonius (2018).
         if (lag_params%pressure_corrector) then
             ! Calculate velocity potentials (valid for one bubble per cell)
-            !$acc parallel loop gang vector default(present) private(k, cell)
+            $:PARALLEL_LOOP(private=["k","cell"])
             do k = 1, nBubs
                 call s_get_pinf(k, q_prim_vf, 2, paux, cell, preterm1, term2, Romega)
                 myR0 = bub_R0(k)
@@ -551,8 +551,9 @@ contains
 
         ! Radial motion model
         adap_dt_stop_max = 0
-        !$acc parallel loop gang vector default(present) private(k, myalpha_rho, myalpha, Re, cell) &
-        !$acc reduction(MAX:adap_dt_stop_max) copy(adap_dt_stop_max) copyin(stage)
+        $:PARALLEL_LOOP(private=["k","myalpha_rho","myalpha","Re","cell"], &
+            & reduction=["adap_dt_stop_max"],reductionOp="MAX", &
+            & copy=["adap_dt_stop_max"],copyin=["stage"])
         do k = 1, nBubs
             ! Keller-Miksis model
 
@@ -1030,7 +1031,7 @@ contains
         integer :: k
 
         if (time_stepper == 1) then ! 1st order TVD RK
-            !$acc parallel loop gang vector default(present) private(k)
+            $:PARALLEL_LOOP(private=["k"])
             do k = 1, nBubs
                 !u{1} = u{n} +  dt * RHS{n}
                 intfc_rad(k, 1) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1)
@@ -1052,7 +1053,7 @@ contains
 
         elseif (time_stepper == 2) then ! 2nd order TVD RK
             if (stage == 1) then
-                !$acc parallel loop gang vector default(present) private(k)
+                $:PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{1} = u{n} +  dt * RHS{n}
                     intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1)
@@ -1064,7 +1065,7 @@ contains
                 end do
 
             elseif (stage == 2) then
-                !$acc parallel loop gang vector default(present) private(k)
+                $:PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1})
                     intfc_rad(k, 1) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/2._wp
@@ -1088,7 +1089,7 @@ contains
 
         elseif (time_stepper == 3) then ! 3rd order TVD RK
             if (stage == 1) then
-                !$acc parallel loop gang vector default(present) private(k)
+                $:PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{1} = u{n} +  dt * RHS{n}
                     intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1)
@@ -1100,7 +1101,7 @@ contains
                 end do
 
             elseif (stage == 2) then
-                !$acc parallel loop gang vector default(present) private(k)
+                $:PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}]
                     intfc_rad(k, 2) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/4._wp
@@ -1111,7 +1112,7 @@ contains
                     gas_mv(k, 2) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/4._wp
                 end do
             elseif (stage == 3) then
-                !$acc parallel loop gang vector default(present) private(k)
+                $:PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}]
                     intfc_rad(k, 1) = intfc_rad(k, 1) + (2._wp/3._wp)*dt*(intfc_draddt(k, 1)/4._wp + intfc_draddt(k, 2)/4._wp + intfc_draddt(k, 3))
@@ -1197,7 +1198,7 @@ contains
 
         integer :: k
 
-        !$acc parallel loop gang vector default(present) private(k)
+        $:PARALLEL_LOOP(private=["k"])
         do k = 1, nBubs
             gas_p(k, 2) = gas_p(k, 1)
             gas_mv(k, 2) = gas_mv(k, 1)
@@ -1605,8 +1606,8 @@ contains
 
         integer :: k
 
-        !$acc parallel loop gang vector default(present) reduction(MAX:Rmax_glb) &
-        !$acc reduction(MIN: Rmin_glb) copy(Rmax_glb, Rmin_glb)
+        $:PARALLEL_LOOP(reduction=[["Rmax_glb"], ["Rmin_glb"]], &
+            & reductionOp=["MAX", "MIN"], copy=["Rmax_glb","Rmin_glb"])
         do k = 1, nBubs
             Rmax_glb = max(Rmax_glb, intfc_rad(k, 1)/bub_R0(k))
             Rmin_glb = min(Rmin_glb, intfc_rad(k, 1)/bub_R0(k))
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index c173c82953..ae00f42a25 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -56,7 +56,7 @@ contains
         real(wp), dimension(3) :: s_coord
         integer :: l
 
-        !$acc parallel loop gang vector default(present) private(l, s_coord, cell)
+        $:PARALLEL_LOOP(private=["l","s_coord","cell"])
         do l = 1, nBubs
 
             volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp
@@ -121,7 +121,7 @@ contains
         smearGridz = smearGrid
         if (p == 0) smearGridz = 1
 
-        !$acc parallel loop gang vector default(present) private(nodecoord, l, s_coord, cell, center) copyin(smearGrid, smearGridz)
+        $:PARALLEL_LOOP(private=["nodecoord","l","s_coord","cell","center"], copyin=["smearGrid","smearGridz"])
         do l = 1, nBubs
             nodecoord(1:3) = 0
             center(1:3) = 0._wp
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index e954eebbc8..16c3008687 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -400,7 +400,7 @@ contains
 
         if (n == 0) then
             l = 0; q = 0
-            !$acc parallel loop collapse(1) gang vector default(present)
+            $:PARALLEL_LOOP()
             do k = 0, m
                 rhs_vf(damage_idx)%sf(k, l, q) = (alpha_bar*max(abs(q_cons_vf(stress_idx%beg)%sf(k, l, q)) - tau_star, 0._wp))**cont_damage_s
             end do
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 57ce03c183..f4b83f9598 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -153,7 +153,11 @@ contains
         type(ghost_point) :: gp
         type(ghost_point) :: innerp
 
-        !$acc parallel loop gang vector private(physical_loc, dyn_pres, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, vel_g, vel_norm_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, presb_IP, massv_IP, rho, gamma, pi_inf, Re_K, G_K, Gs, gp, innerp, norm, buf, j, k, l, q, coeff)
+        $:PARALLEL_LOOP(private=["physical_loc","dyn_pres","alpha_rho_IP", &
+            & "alpha_IP","pres_IP","vel_IP","vel_g","vel_norm_IP","r_IP", &
+            & "v_IP","pb_IP","mv_IP","nmom_IP","presb_IP","massv_IP","rho", &
+            & "gamma","pi_inf","Re_K","G_K","Gs","gp","innerp","norm","buf", &
+            & "j","k","l","q","coeff"])
         do i = 1, num_gps
 
             gp = ghost_points(i)
@@ -297,7 +301,9 @@ contains
         end do
 
         !Correct the state of the inner points in IBs
-        !$acc parallel loop gang vector private(physical_loc, dyn_pres, alpha_rho_IP, alpha_IP, vel_g, rho, gamma, pi_inf, Re_K, innerp, j, k, l, q)
+        $:PARALLEL_LOOP(private=["physical_loc","dyn_pres","alpha_rho_IP", &
+            & "alpha_IP","vel_g","rho","gamma","pi_inf","Re_K","innerp", &
+            & "j","k","l","q"])
         do i = 1, num_inner_gps
 
             vel_g = 0._wp
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 37882ec1ab..f271cb696e 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -799,7 +799,7 @@ contains
             if (bc_x%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_x%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -854,7 +854,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_x%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, n
                         do j = 0, gp_layers - 1
@@ -914,7 +914,7 @@ contains
 #endif
 
             ! Unpacking buffer received from bc_x%beg
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:PARALLEL_LOOP(collapse=3,private=["r"])
             do l = 0, p
                 do k = 0, n
                     do j = -gp_layers, -1
@@ -930,7 +930,7 @@ contains
 
             if (bc_x%beg >= 0) then      ! PBC at the end and beginning
 
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 ! Packing buffer to be sent to bc_x%beg
                 do l = 0, p
                     do k = 0, n
@@ -984,7 +984,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_x%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -1042,7 +1042,7 @@ contains
             end if
 
             ! Unpacking buffer received from bc_x%end
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:PARALLEL_LOOP(collapse=3,private=["r"])
             do l = 0, p
                 do k = 0, n
                     do j = m + 1, m + gp_layers
@@ -1062,7 +1062,7 @@ contains
             if (bc_y%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_y%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1118,7 +1118,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_y%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1180,7 +1180,7 @@ contains
 #endif
 
             ! Unpacking buffer received from bc_y%beg
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:PARALLEL_LOOP(collapse=3,private=["r"])
             do l = 0, p
                 do k = -gp_layers, -1
                     do j = -gp_layers, m + gp_layers
@@ -1198,7 +1198,7 @@ contains
             if (bc_y%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_y%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1254,7 +1254,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_y%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1316,7 +1316,7 @@ contains
 #endif
 
             ! Unpacking buffer received form bc_y%end
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:PARALLEL_LOOP(collapse=3,private=["r"])
             do l = 0, p
                 do k = n + 1, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1336,7 +1336,7 @@ contains
             if (bc_z%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_z%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1393,7 +1393,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_z%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1455,7 +1455,7 @@ contains
 #endif
 
             ! Unpacking buffer from bc_z%beg
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:PARALLEL_LOOP(collapse=3,private=["r"])
             do l = -gp_layers, -1
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1474,7 +1474,7 @@ contains
             if (bc_z%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_z%beg
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1529,7 +1529,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_z%end
-                !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                $:PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1591,7 +1591,7 @@ contains
 #endif
 
             ! Unpacking buffer received from bc_z%end
-            !$acc parallel loop collapse(3) gang vector default(present) private(r)
+            $:PARALLEL_LOOP(collapse=3,private=["r"])
             do l = p + 1, p + gp_layers
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 912ab99e13..9dc18d1771 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -429,7 +429,7 @@ contains
 
             !Non-polytropic qbmm needs to account for change in bubble radius due to a change in nb
             if (.not. polytropic) then
-                !$acc parallel loop collapse(5) gang vector default(present) private(nb_q, nR, nR2, R, R2, nb_dot, nR_dot, nR2_dot, var, AX)
+                $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
                 do i = 1, nb
                     do q = 1, nnode
                         do l = 0, p
@@ -513,7 +513,7 @@ contains
 
             !Non-polytropic qbmm needs to account for change in bubble radius due to a change in nb
             if (.not. polytropic) then
-                !$acc parallel loop collapse(5) gang vector default(present) private(nb_q, nR, nR2, R, R2, nb_dot, nR_dot, nR2_dot, var, AX)
+                $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
                 do i = 1, nb
                     do q = 1, nnode
                         do l = 0, p
@@ -570,7 +570,7 @@ contains
             if (.not. polytropic) then
                 if (grid_geometry == 3) then
                     !Non-polytropic qbmm needs to account for change in bubble radius due to a change in nb
-                    !$acc parallel loop collapse(5) gang vector default(present) private(nb_q, nR, nR2, R, R2, nb_dot, nR_dot, nR2_dot, var, AX)
+                    $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
                     do i = 1, nb
                         do q = 1, nnode
                             do l = 0, p
@@ -621,7 +621,7 @@ contains
                     end do
                 else
                     !Non-polytropic qbmm needs to account for change in bubble radius due to a change in nb
-                    !$acc parallel loop collapse(5) gang vector default(present) private(nb_q, nR, nR2, R, R2, nb_dot, nR_dot, nR2_dot, var, AX)
+                    $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
                     do i = 1, nb
                         do q = 1, nnode
                             do l = 0, p
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index cbc50298ba..c31095f6ca 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -1010,7 +1010,7 @@ contains
                 call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz)
             end if
 
-            !$acc parallel loop collapse(4) gang vector default(present) private(inv_ds, flux_face1, flux_face2)
+            $:PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
             do j = 1, sys_size
                 do q_loop = 0, p
                     do l_loop = 0, n
@@ -1025,8 +1025,8 @@ contains
             end do
 
             if (model_eqns == 3) then
-                !$acc parallel loop collapse(4) gang vector default(present) &
-                !$acc private(inv_ds, advected_qty_val, pressure_val, flux_face1, flux_face2)
+                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
+                    & "pressure_val","flux_face1","flux_face2"])
                 do q_loop = 0, p
                     do l_loop = 0, n
                         do k_loop = 0, m
@@ -1055,7 +1055,7 @@ contains
                 call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz)
             end if
 
-            !$acc parallel loop collapse(4) gang vector default(present) private(inv_ds, flux_face1, flux_face2)
+            $:PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
             do j = 1, sys_size
                 do l = 0, p
                     do k = 0, n
@@ -1070,8 +1070,8 @@ contains
             end do
 
             if (model_eqns == 3) then
-                !$acc parallel loop collapse(4) gang vector default(present) &
-                !$acc private(inv_ds, advected_qty_val, pressure_val, flux_face1, flux_face2)
+                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
+                    & "pressure_val","flux_face1","flux_face2"])
                 do l = 0, p
                     do k = 0, n
                         do q = 0, m
@@ -1096,7 +1096,7 @@ contains
             end if
 
             if (cyl_coord) then
-                !$acc parallel loop collapse(4) gang vector default(present) private(flux_face1, flux_face2)
+                $:PARALLEL_LOOP(collapse=4,private=["flux_face1","flux_face2"])
                 do j = 1, sys_size
                     do l = 0, p
                         do k = 0, n
@@ -1122,8 +1122,8 @@ contains
             end if
 
             if (grid_geometry == 3) then ! Cylindrical Coordinates
-                !$acc parallel loop collapse(4) gang vector default(present) &
-                !$acc private(inv_ds, velocity_val, flux_face1, flux_face2)
+                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","velocity_val", &
+                    & "flux_face1","flux_face2"])
                 do j = 1, sys_size
                     do k = 0, p
                         do q = 0, n
@@ -1138,7 +1138,7 @@ contains
                         end do
                     end do
                 end do
-                !$acc parallel loop collapse(4) gang vector default(present) private(flux_face1, flux_face2)
+                $:PARALLEL_LOOP(collapse=4,private=["flux_face1","flux_face2"])
                 do j = 1, sys_size
                     do k = 0, p
                         do q = 0, n
@@ -1152,7 +1152,7 @@ contains
                     end do
                 end do
             else ! Cartesian Coordinates
-                !$acc parallel loop collapse(4) gang vector default(present) private(inv_ds, flux_face1, flux_face2)
+                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
                 do j = 1, sys_size
                     do k = 0, p
                         do q = 0, n
@@ -1168,8 +1168,8 @@ contains
             end if
 
             if (model_eqns == 3) then
-                !$acc parallel loop collapse(4) gang vector default(present) &
-                !$acc private(inv_ds, advected_qty_val, pressure_val, flux_face1, flux_face2)
+                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
+                    & "pressure_val","flux_face1","flux_face2"])
                 do k = 0, p
                     do q = 0, n
                         do l = 0, m
@@ -1213,8 +1213,8 @@ contains
             case (1) ! x-direction
                 use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4)
                 if (use_standard_riemann) then
-                    !$acc parallel loop collapse(4) gang vector default(present) &
-                    !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2)
+                    $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                        & "local_term_coeff","local_flux1","local_flux2"])
                     do j_adv = advxb, advxe
                         do q_idx = 0, p ! z_extent
                             do l_idx = 0, n ! y_extent
@@ -1262,8 +1262,8 @@ contains
                                     end do; end do; end do
                         end if
                     else ! NOT alt_soundspeed
-                        !$acc parallel loop collapse(4) gang vector default(present) &
-                        !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2)
+                        $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                            & "local_term_coeff","local_flux1","local_flux2"])
                         do j_adv = advxb, advxe
                             do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m
                                         local_inv_ds = 1._wp/dx(k_idx)
@@ -1280,8 +1280,8 @@ contains
             case (2) ! y-direction: loops q_idx (x), k_idx (y), l_idx (z); sf(q_idx, k_idx, l_idx); dy(k_idx); Kterm(q_idx,k_idx,l_idx)
                 use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4)
                 if (use_standard_riemann) then
-                    !$acc parallel loop collapse(4) gang vector default(present) &
-                    !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2)
+                    $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                        & "local_term_coeff","local_flux1","local_flux2"])
                     do j_adv = advxb, advxe
                         do l_idx = 0, p ! z_extent
                             do k_idx = 0, n ! y_extent
@@ -1338,8 +1338,8 @@ contains
                                     end do; end do; end do
                         end if
                     else ! NOT alt_soundspeed
-                        !$acc parallel loop collapse(4) gang vector default(present) &
-                        !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2)
+                        $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                            & "local_term_coeff","local_flux1","local_flux2"])
                         do j_adv = advxb, advxe
                             do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m
                                         local_inv_ds = 1._wp/dy(k_idx)
@@ -1361,8 +1361,8 @@ contains
                 end if
 
                 if (use_standard_riemann) then
-                    !$acc parallel loop collapse(4) gang vector default(present) &
-                    !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2)
+                    $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                        & "local_term_coeff","local_flux1","local_flux2"])
                     do j_adv = advxb, advxe
                         do k_idx = 0, p ! z_extent
                             do q_idx = 0, n ! y_extent
@@ -1411,8 +1411,8 @@ contains
                                     end do; end do; end do
                         end if
                     else ! NOT alt_soundspeed
-                        !$acc parallel loop collapse(4) gang vector default(present) &
-                        !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2)
+                        $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                            & "local_term_coeff","local_flux1","local_flux2"])
                         do j_adv = advxb, advxe
                             do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m
                                         local_inv_ds = 1._wp/dz(k_idx)
@@ -2008,7 +2008,7 @@ contains
         $:UPDATE(device=["is1","is2","is3","iv"])
 
         if (recon_dir == 1) then
-            !$acc parallel loop collapse(4) default(present)
+            $:PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -2021,7 +2021,7 @@ contains
             end do
             !$acc end parallel loop
         else if (recon_dir == 2) then
-            !$acc parallel loop collapse(4) default(present)
+            $:PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -2034,7 +2034,7 @@ contains
             end do
             !$acc end parallel loop
         else if (recon_dir == 3) then
-            !$acc parallel loop collapse(4) default(present)
+            $:PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 3f11f01443..97c987327d 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -340,7 +340,7 @@ contains
         $:UPDATE(device=["is1","is2","is3","iv"])
 
         if (recon_dir == 1) then
-            !$acc parallel loop collapse(4) default(present)
+            $:PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -353,7 +353,7 @@ contains
             end do
             !$acc end parallel loop
         else if (recon_dir == 2) then
-            !$acc parallel loop collapse(4) default(present)
+            $:PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -366,7 +366,7 @@ contains
             end do
             !$acc end parallel loop
         else if (recon_dir == 3) then
-            !$acc parallel loop collapse(4) default(present)
+            $:PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index e37f75d456..07815a9c46 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -1000,7 +1000,7 @@ contains
 
         end if
 
-        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
+        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
 
         if (n > 0) then
             if (p > 0) then
@@ -1097,7 +1097,7 @@ contains
 
         end if
 
-        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
+        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
 
         if (n > 0) then
             if (p > 0) then
@@ -1200,7 +1200,7 @@ contains
         is3_viscous = iz
         iv = iv_in
 
-        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
+        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
 
         ! First-Order Spatial Derivatives in x-direction
         if (norm_dir == 1) then
@@ -1334,7 +1334,7 @@ contains
         end do
 
         if (n > 0) then
-            !$acc parallel loop collapse(3) gang vector
+            $:PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end
@@ -1347,7 +1347,7 @@ contains
         end if
 
         if (p > 0) then
-            !$acc parallel loop collapse(3) gang vector
+            $:PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 0ca1775217..1db77c8b45 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -673,7 +673,7 @@ contains
 
         if (weno_order == 1) then
             if (weno_dir == 1) then
-                !$acc parallel loop collapse(4) default(present)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
@@ -686,7 +686,7 @@ contains
                 end do
                 !$acc end parallel loop
             else if (weno_dir == 2) then
-                !$acc parallel loop collapse(4) default(present)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
@@ -699,7 +699,7 @@ contains
                 end do
                 !$acc end parallel loop
             else if (weno_dir == 3) then
-                !$acc parallel loop collapse(4) default(present)
+                $:PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
@@ -715,7 +715,7 @@ contains
         elseif (weno_order == 3) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
-                    !$acc parallel loop collapse(4) gang vector default(present) private(beta,dvd,poly,omega,alpha,tau)
+                    $:PARALLEL_LOOP(collapse=4,private=["beta","dvd","poly","omega","alpha","tau"])
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
@@ -794,7 +794,7 @@ contains
         elseif (weno_order == 5) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
-                    !$acc parallel loop vector gang collapse(3) default(present) private(dvd, poly, beta, alpha, omega, tau, delta)
+                    $:PARALLEL_LOOP(collapse=3,private=["dvd","poly","beta","alpha","omega","tau","delta"])
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
@@ -914,7 +914,7 @@ contains
         elseif (weno_order == 7) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
-                    !$acc parallel loop vector gang collapse(3) default(present) private(poly, beta, alpha, omega, tau, delta, dvd, v)
+                    $:PARALLEL_LOOP(collapse=3,private=["poly","beta","alpha","omega","tau","delta","dvd","v"])
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
@@ -1269,7 +1269,7 @@ contains
         real(wp), parameter :: alpha_mp = 2._wp
         real(wp), parameter :: beta_mp = 4._wp/3._wp
 
-        !$acc parallel loop gang vector collapse (4)  default(present) private(d)
+        $:PARALLEL_LOOP(collapse=4,private=["d"])
         do l = is3_weno%beg, is3_weno%end
             do k = is2_weno%beg, is2_weno%end
                 do j = is1_weno%beg, is1_weno%end

From b4afb9f552a21c0724ed133de725b69d70835f31 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 11 Jun 2025 16:55:39 -0400
Subject: [PATCH 22/75] Ran formatter

---
 src/common/m_mpi_common.fpp            | 2 +-
 src/common/m_variables_conversion.fpp  | 2 +-
 src/simulation/m_global_parameters.fpp | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 19bd461d5c..2d1b079467 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -643,7 +643,7 @@ contains
 #ifdef MFC_MPI
 
         call nvtxStartRange("RHS-COMM-PACKBUF")
-$:UPDATE(device=["v_size"])
+        $:UPDATE(device=["v_size"])
 
 #ifdef MFC_SIMULATION
         if (qbmm .and. .not. polytropic) then
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 7c49767723..796dcb5003 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -653,7 +653,7 @@ contains
             qvs(i) = fluid_pp(i)%qv
             qvps(i) = fluid_pp(i)%qvp
         end do
-$:UPDATE(device=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps","Gs"])
+        $:UPDATE(device=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps","Gs"])
 
 #ifdef MFC_SIMULATION
 
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index b93f023d05..e2e14286d7 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -311,7 +311,6 @@ module m_global_parameters
 
     $:DECLARE(create=["buff_size"])
 
-
     integer :: shear_num !! Number of shear stress components
     integer, dimension(3) :: shear_indices !<
     !! Indices of the stress components that represent shear stress

From 764c71db495e37e7b0f23e9b9af62db2ef72ab5b Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 16:05:25 -0400
Subject: [PATCH 23/75] Small refactor of QBMM subroutines (#856)

---
 src/simulation/m_qbmm.fpp | 641 ++++++++++++++------------------------
 1 file changed, 241 insertions(+), 400 deletions(-)

diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 9dc18d1771..c5338bfed0 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -422,258 +422,143 @@ contains
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: mv, rhs_mv
 
         integer :: i, j, k, l, q
-
         real(wp) :: nb_q, nb_dot, R, R2, nR, nR2, nR_dot, nR2_dot, var, AX
+        logical :: is_axisym
 
-        if (idir == 1) then
-
-            !Non-polytropic qbmm needs to account for change in bubble radius due to a change in nb
-            if (.not. polytropic) then
-                $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
-                do i = 1, nb
-                    do q = 1, nnode
-                        do l = 0, p
-                            do k = 0, n
-                                do j = 0, m
-                                    nb_q = q_cons_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)
-                                    nR = q_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
-                                    nR2 = q_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
-                                    R = q_prim_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
-                                    R2 = q_prim_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
-                                    if (R2 - R**2._wp > 0._wp) then
-                                        var = R2 - R**2._wp
-                                    else
-                                        var = verysmall
-                                    end if
-
-                                    if (q <= 2) then
-                                        AX = R - sqrt(var)
-                                    else
-                                        AX = R + sqrt(var)
-                                    end if
+        select case (idir)
+        case (1)
+            is_axisym = .false.
+        case (2)
+            is_axisym = .false.
+        case (3)
+            is_axisym = (grid_geometry == 3)
+        end select
 
+        if (.not. polytropic) then
+            $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
+            do i = 1, nb
+                do q = 1, nnode
+                    do l = 0, p
+                        do k = 0, n
+                            do j = 0, m
+                                nb_q = q_cons_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)
+                                nR = q_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
+                                nR2 = q_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
+                                R = q_prim_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
+                                R2 = q_prim_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
+                                var = max(R2 - R**2._wp, verysmall)
+                                if (q <= 2) then
+                                    AX = R - sqrt(var)
+                                else
+                                    AX = R + sqrt(var)
+                                end if
+                                select case (idir)
+                                case (1)
                                     nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j - 1, k, l) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)
                                     nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j - 1, k, l) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
                                     nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j - 1, k, l) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
                                     rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2)* &
                                                             (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i))
-
-                                    if (q <= 2) then
-                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* &
-                                                                (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
-                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* &
-                                                                (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
-
-                                    else
-                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* &
-                                                                (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
-                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* &
-                                                                (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
-                                    end if
-
-                                end do
-                            end do
-                        end do
-                    end do
-                end do
-            end if
-
-            $:PARALLEL_LOOP(collapse=3)
-            do l = 0, p
-                do q = 0, n
-                    do i = 0, m
-
-                        rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + mom_sp(2)%sf(i, q, l)
-                        j = bubxb
-                        $:LOOP()
-                        do k = 1, nb
-                            rhs_vf(j)%sf(i, q, l) = &
-                                rhs_vf(j)%sf(i, q, l) + mom_3d(0, 0, k)%sf(i, q, l)
-                            rhs_vf(j + 1)%sf(i, q, l) = &
-                                rhs_vf(j + 1)%sf(i, q, l) + mom_3d(1, 0, k)%sf(i, q, l)
-                            rhs_vf(j + 2)%sf(i, q, l) = &
-                                rhs_vf(j + 2)%sf(i, q, l) + mom_3d(0, 1, k)%sf(i, q, l)
-                            rhs_vf(j + 3)%sf(i, q, l) = &
-                                rhs_vf(j + 3)%sf(i, q, l) + mom_3d(2, 0, k)%sf(i, q, l)
-                            rhs_vf(j + 4)%sf(i, q, l) = &
-                                rhs_vf(j + 4)%sf(i, q, l) + mom_3d(1, 1, k)%sf(i, q, l)
-                            rhs_vf(j + 5)%sf(i, q, l) = &
-                                rhs_vf(j + 5)%sf(i, q, l) + mom_3d(0, 2, k)%sf(i, q, l)
-                            j = j + 6
-                        end do
-
-                    end do
-                end do
-            end do
-
-        elseif (idir == 2) then
-
-            !Non-polytropic qbmm needs to account for change in bubble radius due to a change in nb
-            if (.not. polytropic) then
-                $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
-                do i = 1, nb
-                    do q = 1, nnode
-                        do l = 0, p
-                            do k = 0, n
-                                do j = 0, m
-                                    nb_q = q_cons_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)
-                                    nR = q_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
-                                    nR2 = q_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
-                                    R = q_prim_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
-                                    R2 = q_prim_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
-                                    if (R2 - R**2._wp > 0._wp) then
-                                        var = R2 - R**2._wp
-                                    else
-                                        var = verysmall
-                                    end if
-
-                                    if (q <= 2) then
-                                        AX = R - sqrt(var)
-                                    else
-                                        AX = R + sqrt(var)
-                                    end if
-
+                                case (2)
                                     nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k - 1, l) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)
                                     nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k - 1, l) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
                                     nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k - 1, l) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
                                     rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2)* &
                                                             (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i))
-
-                                    if (q <= 2) then
-                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
-                                                                (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
-                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
-                                                                (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
-
-                                    else
-                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
-                                                                (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
-                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
-                                                                (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
-                                    end if
-
-                                end do
-                            end do
-                        end do
-                    end do
-                end do
-            end if
-
-        elseif (idir == 3) then
-
-            if (.not. polytropic) then
-                if (grid_geometry == 3) then
-                    !Non-polytropic qbmm needs to account for change in bubble radius due to a change in nb
-                    $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
-                    do i = 1, nb
-                        do q = 1, nnode
-                            do l = 0, p
-                                do k = 0, n
-                                    do j = 0, m
-                                        nb_q = q_cons_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)
-                                        nR = q_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
-                                        nR2 = q_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
-                                        R = q_prim_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
-                                        R2 = q_prim_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
-                                        if (R2 - R**2._wp > 0._wp) then
-                                            var = R2 - R**2._wp
-                                        else
-                                            var = verysmall
-                                        end if
-
-                                        if (q <= 2) then
-                                            AX = R - sqrt(var)
-                                        else
-                                            AX = R + sqrt(var)
-                                        end if
-
+                                case (3)
+                                    if (is_axisym) then
                                         nb_dot = q_prim_vf(contxe + idir)%sf(j, k, l)*(flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l))
                                         nR_dot = q_prim_vf(contxe + idir)%sf(j, k, l)*(flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l))
                                         nR2_dot = q_prim_vf(contxe + idir)%sf(j, k, l)*(flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l))
-
                                         rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2)* &
                                                                 (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i))
-
-                                        if (q <= 2) then
+                                    else
+                                        nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)
+                                        nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
+                                        nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
+                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2)* &
+                                                                (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i))
+                                    end if
+                                end select
+                                if (q <= 2) then
+                                    select case (idir)
+                                    case (1)
+                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                                                (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
+                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                                                (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
+                                    case (2)
+                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                                                (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
+                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                                                (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
+                                    case (3)
+                                        if (is_axisym) then
                                             rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
                                                                     (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
                                             rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
                                                                     (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
-
                                         else
-                                            rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                            rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* &
                                                                     (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
-                                            rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                            rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* &
                                                                     (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
                                         end if
-                                    end do
-                                end do
-                            end do
-                        end do
-                    end do
-                else
-                    !Non-polytropic qbmm needs to account for change in bubble radius due to a change in nb
-                    $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
-                    do i = 1, nb
-                        do q = 1, nnode
-                            do l = 0, p
-                                do k = 0, n
-                                    do j = 0, m
-                                        nb_q = q_cons_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)
-                                        nR = q_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
-                                        nR2 = q_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
-                                        R = q_prim_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
-                                        R2 = q_prim_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
-                                        if (R2 - R**2._wp > 0._wp) then
-                                            var = R2 - R**2._wp
-                                        else
-                                            var = verysmall
-                                        end if
-
-                                        if (q <= 2) then
-                                            AX = R - sqrt(var)
-                                        else
-                                            AX = R + sqrt(var)
-                                        end if
-
-                                        nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)
-                                        nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)
-                                        nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)
-
-                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2)* &
-                                                                (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i))
-
-                                        if (q <= 2) then
-                                            rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                    end select
+                                else
+                                    select case (idir)
+                                    case (1)
+                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                                                (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
+                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                                                (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
+                                    case (2)
+                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                                                (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
+                                        rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                                                (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
+                                    case (3)
+                                        if (is_axisym) then
+                                            rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
                                                                     (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
-                                            rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* &
+                                            rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* &
                                                                     (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
-
                                         else
                                             rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* &
                                                                     (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i))
                                             rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* &
                                                                     (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i))
                                         end if
-
-                                    end do
-                                end do
+                                    end select
+                                end if
                             end do
                         end do
                     end do
-                end if
-            end if
+                end do
+            end do
+        end if
 
+        ! The following block is not repeated and is left as is
+        if (idir == 1) then
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do l = 0, p
+                do q = 0, n
+                    do i = 0, m
+                        rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + mom_sp(2)%sf(i, q, l)
+                        j = bubxb
+                        !$acc loop seq
+                        do k = 1, nb
+                            rhs_vf(j)%sf(i, q, l) = rhs_vf(j)%sf(i, q, l) + mom_3d(0, 0, k)%sf(i, q, l)
+                            rhs_vf(j + 1)%sf(i, q, l) = rhs_vf(j + 1)%sf(i, q, l) + mom_3d(1, 0, k)%sf(i, q, l)
+                            rhs_vf(j + 2)%sf(i, q, l) = rhs_vf(j + 2)%sf(i, q, l) + mom_3d(0, 1, k)%sf(i, q, l)
+                            rhs_vf(j + 3)%sf(i, q, l) = rhs_vf(j + 3)%sf(i, q, l) + mom_3d(2, 0, k)%sf(i, q, l)
+                            rhs_vf(j + 4)%sf(i, q, l) = rhs_vf(j + 4)%sf(i, q, l) + mom_3d(1, 1, k)%sf(i, q, l)
+                            rhs_vf(j + 5)%sf(i, q, l) = rhs_vf(j + 5)%sf(i, q, l) + mom_3d(0, 2, k)%sf(i, q, l)
+                            j = j + 6
+                        end do
+                    end do
+                end do
+            end do
         end if
 
     end subroutine s_compute_qbmm_rhs
@@ -760,7 +645,7 @@ contains
         $:ROUTINE()
 #endif
 
-        real(wp), intent(inout) :: pres, rho, c
+        real(wp), intent(in) :: pres, rho, c
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
 
         integer :: i1, i2
@@ -831,13 +716,9 @@ contains
         real(wp), dimension(nterms, 0:2, 0:2) :: coeff
         real(wp) :: pres, rho, nbub, c, alf, momsum, drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, T_bar, grad_T
         real(wp) :: n_tait, B_tait
-
-        integer :: id1, id2, id3
-        integer :: i1, i2
-        integer :: j, q, r
+        integer :: id1, id2, id3, i1, i2, j, q, r
 
         is1_qbmm = ix; is2_qbmm = iy; is3_qbmm = iz
-
         $:UPDATE(device=["is1_qbmm","is2_qbmm","is3_qbmm"])
 
         $:PARALLEL_LOOP(collapse=3, private=["moms", "msum", "wght", "abscX", &
@@ -851,73 +732,52 @@ contains
                     alf = q_prim_vf(alf_idx)%sf(id1, id2, id3)
                     pres = q_prim_vf(E_idx)%sf(id1, id2, id3)
                     rho = q_prim_vf(contxb)%sf(id1, id2, id3)
+
                     if (bubble_model == 2) then
-                        n_tait = gammas(1)
-                        n_tait = 1._wp/n_tait + 1._wp !make this the usual little 'gamma'
+                        n_tait = 1._wp/gammas(1) + 1._wp
                         B_tait = pi_infs(1)*(n_tait - 1)/n_tait
                         c = n_tait*(pres + B_tait)*(1._wp - alf)/(rho)
-
-                        if (c > 0._wp) then
-                            c = sqrt(c)
-                        else
-                            c = sgm_eps
-                        end if
+                        c = merge(sqrt(c), sgm_eps, c > 0._wp)
                     end if
 
-                    if (polytropic) then
-                        call s_coeff(pres, rho, c, coeff)
-                    else
-                        call s_coeff_nonpoly(pres, rho, c, coeff)
-                    end if
+                    call s_coeff_selector(pres, rho, c, coeff, polytropic)
 
-                    ! SHB: Manually adjusted pressure here for no-coupling case
-                    ! pres = 1._wp/0.3_wp
                     if (alf > small_alf) then
-
                         nbub = q_cons_vf(bubxb)%sf(id1, id2, id3)
-
                         $:LOOP()
                         do q = 1, nb
-                            !Initialize moment set for each R0 bin
+                            ! Gather moments for this bubble bin
                             $:LOOP()
                             do r = 2, nmom
                                 moms(r) = q_prim_vf(bubmoms(q, r))%sf(id1, id2, id3)
                             end do
-
                             moms(1) = 1._wp
-
                             call s_chyqmom(moms, wght(:, q), abscX(:, q), abscY(:, q))
 
                             if (polytropic) then
-                                !Account for bubble pressure pb0 at each R0 bin
                                 $:LOOP()
                                 do j = 1, nnode
                                     wght_pb(j, q) = wght(j, q)*(pb0(q) - pv)
                                 end do
                             else
-                                !Account for bubble pressure, mass transfer rate and heat transfer rate in wght_pb, wght_mv and wght_ht using Preston model
                                 $:LOOP()
                                 do j = 1, nnode
                                     chi_vw = 1._wp/(1._wp + R_v/R_n*(pb(id1, id2, id3, j, q)/pv - 1._wp))
                                     x_vw = M_n*chi_vw/(M_v + (M_n - M_v)*chi_vw)
-                                    k_mw = x_vw*k_v(q)/(x_vw + (1._wp - x_vw)*phi_vn) &
-                                           + (1._wp - x_vw)*k_n(q)/(x_vw*phi_nv + 1._wp - x_vw)
+                                    k_mw = x_vw*k_v(q)/(x_vw + (1._wp - x_vw)*phi_vn) + (1._wp - x_vw)*k_n(q)/(x_vw*phi_nv + 1._wp - x_vw)
                                     rho_mw = pv/(chi_vw*R_v*Tw)
                                     rhs_mv(id1, id2, id3, j, q) = -Re_trans_c(q)*((mv(id1, id2, id3, j, q)/(mv(id1, id2, id3, j, q) + mass_n0(q))) - chi_vw)
                                     rhs_mv(id1, id2, id3, j, q) = rho_mw*rhs_mv(id1, id2, id3, j, q)/Pe_c/(1._wp - chi_vw)/abscX(j, q)
-
-                                    T_bar = Tw*(pb(id1, id2, id3, j, q)/pb0(q))*(abscX(j, q)/R0(q))**3 &
-                                            *(mass_n0(q) + mass_v0(q))/(mass_n0(q) + mv(id1, id2, id3, j, q))
+                                    T_bar = Tw*(pb(id1, id2, id3, j, q)/pb0(q))*(abscX(j, q)/R0(q))**3*(mass_n0(q) + mass_v0(q))/(mass_n0(q) + mv(id1, id2, id3, j, q))
                                     grad_T = -Re_trans_T(q)*(T_bar - Tw)
                                     ht(j, q) = pb0(q)*k_mw*grad_T/Pe_T(q)/abscX(j, q)
-
                                     wght_pb(j, q) = wght(j, q)*(pb(id1, id2, id3, j, q))
                                     wght_mv(j, q) = wght(j, q)*(rhs_mv(id1, id2, id3, j, q))
                                     wght_ht(j, q) = wght(j, q)*ht(j, q)
                                 end do
                             end if
 
-                            !Compute change in moments due to bubble dynamics
+                            ! Compute change in moments due to bubble dynamics
                             r = 1
                             $:LOOP()
                             do i2 = 0, 2
@@ -927,67 +787,45 @@ contains
                                         momsum = 0._wp
                                         $:LOOP()
                                         do j = 1, nterms
-                                            ! Account for term with pb in Rayleigh Plesset equation
-                                            if (bubble_model == 3 .and. j == 3) then
-                                                momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q)) &
-                                                         *f_quad2D(abscX(:, q), abscY(:, q), wght_pb(:, q), momrhs(:, i1, i2, j, q))
-                                                ! Account for terms with pb in Keller-Miksis equation
-                                            else if (bubble_model == 2 .and. ((j >= 7 .and. j <= 9) .or. (j >= 22 .and. j <= 23) .or. (j >= 10 .and. j <= 11) .or. (j == 26))) then
-                                                momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q)) &
-                                                         *f_quad2D(abscX(:, q), abscY(:, q), wght_pb(:, q), momrhs(:, i1, i2, j, q))
-                                                ! Account for terms with mass transfer rate in Keller-Miksis equation
-                                            else if (bubble_model == 2 .and. (j >= 27 .and. j <= 29) .and. (.not. polytropic)) then
-                                                momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q)) &
-                                                         *f_quad2D(abscX(:, q), abscY(:, q), wght_mv(:, q), momrhs(:, i1, i2, j, q))
-                                                ! Account for terms with heat transfer rate in Keller-Miksis equation
-                                            else if (bubble_model == 2 .and. (j >= 30 .and. j <= 32) .and. (.not. polytropic)) then
-                                                momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q)) &
-                                                         *f_quad2D(abscX(:, q), abscY(:, q), wght_ht(:, q), momrhs(:, i1, i2, j, q))
-                                            else
-                                                momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q)) &
-                                                         *f_quad2D(abscX(:, q), abscY(:, q), wght(:, q), momrhs(:, i1, i2, j, q))
-                                            end if
-
+                                            select case (bubble_model)
+                                            case (3)
+                                                if (j == 3) then
+                                                    momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_pb(:, q), momrhs(:, i1, i2, j, q))
+                                                else
+                                                    momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght(:, q), momrhs(:, i1, i2, j, q))
+                                                end if
+                                            case (2)
+                                                if ((j >= 7 .and. j <= 9) .or. (j >= 22 .and. j <= 23) .or. (j >= 10 .and. j <= 11) .or. (j == 26)) then
+                                                    momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_pb(:, q), momrhs(:, i1, i2, j, q))
+                                                else if ((j >= 27 .and. j <= 29) .and. (.not. polytropic)) then
+                                                    momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_mv(:, q), momrhs(:, i1, i2, j, q))
+                                                else if ((j >= 30 .and. j <= 32) .and. (.not. polytropic)) then
+                                                    momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_ht(:, q), momrhs(:, i1, i2, j, q))
+                                                else
+                                                    momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght(:, q), momrhs(:, i1, i2, j, q))
+                                                end if
+                                            end select
                                         end do
-
                                         moms3d(i1, i2, q)%sf(id1, id2, id3) = nbub*momsum
                                         msum(r) = momsum
                                         r = r + 1
-
                                     end if
                                 end do
                             end do
 
-                            ! Compute change in pb and mv for non-polytroic model
+                            ! Compute change in pb and mv for non-polytropic model
                             if (.not. polytropic) then
                                 $:LOOP()
                                 do j = 1, nnode
-                                    ! Compute Rdot (drdt) at quadrature node in the ODE for pb (note this is not the same as bubble variable Rdot)
                                     drdt = msum(2)
-                                    if (moms(4) - moms(2)**2._wp > 0._wp) then
-                                        if (j == 1 .or. j == 2) then
-                                            drdt2 = -1._wp/(2._wp*sqrt(moms(4) - moms(2)**2._wp))
-                                        else
-                                            drdt2 = 1._wp/(2._wp*sqrt(moms(4) - moms(2)**2._wp))
-                                        end if
-                                    else
-                                        ! Edge case where variance < 0
-                                        if (j == 1 .or. j == 2) then
-                                            drdt2 = -1._wp/(2._wp*sqrt(verysmall))
-                                        else
-                                            drdt2 = 1._wp/(2._wp*sqrt(verysmall))
-                                        end if
-                                    end if
-
+                                    drdt2 = merge(-1._wp, 1._wp, j == 1 .or. j == 2)/(2._wp*sqrt(merge(moms(4) - moms(2)**2._wp, verysmall, moms(4) - moms(2)**2._wp > 0._wp)))
                                     drdt2 = drdt2*(msum(3) - 2._wp*moms(2)*msum(2))
                                     drdt = drdt + drdt2
-
                                     rhs_pb(id1, id2, id3, j, q) = (-3._wp*gam*drdt/abscX(j, q))*(pb(id1, id2, id3, j, q))
                                     rhs_pb(id1, id2, id3, j, q) = rhs_pb(id1, id2, id3, j, q) + (3._wp*gam/abscX(j, q))*rhs_mv(id1, id2, id3, j, q)*R_v*Tw
                                     rhs_pb(id1, id2, id3, j, q) = rhs_pb(id1, id2, id3, j, q) + (3._wp*gam/abscX(j, q))*ht(j, q)
                                     rhs_mv(id1, id2, id3, j, q) = rhs_mv(id1, id2, id3, j, q)*(4._wp*pi*abscX(j, q)**2._wp)
                                 end do
-
                             end if
                         end do
 
@@ -996,19 +834,14 @@ contains
                         momsp(2)%sf(id1, id2, id3) = 4._wp*pi*nbub*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp)
                         momsp(3)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght, 3._wp, 2._wp, 0._wp)
                         if (abs(gam - 1._wp) <= 1.e-4_wp) then
-                            ! Gam \approx 1, don't risk imaginary quadrature
                             momsp(4)%sf(id1, id2, id3) = 1._wp
                         else
-                            !Special moment with bubble pressure pb
                             if (polytropic) then
-                                momsp(4)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght_pb, 3._wp*(1._wp - gam), 0._wp, 3._wp*gam) + pv*f_quad(abscX, abscY, wght, 3._wp, 0._wp, 0._wp) &
-                                                             - 4._wp*Re_inv*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) - (2._wp/Web)*f_quad(abscX, abscY, wght, 2._wp, 0._wp, 0._wp)
+                                momsp(4)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght_pb, 3._wp*(1._wp - gam), 0._wp, 3._wp*gam) + pv*f_quad(abscX, abscY, wght, 3._wp, 0._wp, 0._wp) - 4._wp*Re_inv*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) - (2._wp/Web)*f_quad(abscX, abscY, wght, 2._wp, 0._wp, 0._wp)
                             else
-                                momsp(4)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght_pb, 3._wp, 0._wp, 0._wp) &
-                                                             - 4._wp*Re_inv*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) - (2._wp/Web)*f_quad(abscX, abscY, wght, 2._wp, 0._wp, 0._wp)
+                                momsp(4)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght_pb, 3._wp, 0._wp, 0._wp) - 4._wp*Re_inv*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) - (2._wp/Web)*f_quad(abscX, abscY, wght, 2._wp, 0._wp, 0._wp)
                             end if
                         end if
-
                     else
                         $:LOOP()
                         do q = 1, nb
@@ -1020,133 +853,141 @@ contains
                                 end do
                             end do
                         end do
-
                         momsp(1)%sf(id1, id2, id3) = 0._wp
                         momsp(2)%sf(id1, id2, id3) = 0._wp
                         momsp(3)%sf(id1, id2, id3) = 0._wp
                         momsp(4)%sf(id1, id2, id3) = 0._wp
-
                     end if
-
                 end do
             end do
         end do
 
-    end subroutine s_mom_inv
+    contains
+        ! Helper to select the correct coefficient routine
+        subroutine s_coeff_selector(pres, rho, c, coeff, polytropic)
+#ifdef _CRAYFTN
+            !DIR$ INLINEALWAYS s_chyqmom
+#else
+            !$acc routine seq
+#endif
+            real(wp), intent(in) :: pres, rho, c
+            real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
+            logical, intent(in) :: polytropic
+            if (polytropic) then
+                call s_coeff(pres, rho, c, coeff)
+            else
+                call s_coeff_nonpoly(pres, rho, c, coeff)
+            end if
+        end subroutine s_coeff_selector
 
-    pure subroutine s_chyqmom(momin, wght, abscX, abscY)
+        pure subroutine s_chyqmom(momin, wght, abscX, abscY)
 #ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_chyqmom
+            !DIR$ INLINEALWAYS s_chyqmom
 #else
-        $:ROUTINE()
+            $:ROUTINE()
 #endif
-        real(wp), dimension(nmom), intent(in) :: momin
-        real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY
-
-        real(wp), dimension(0:2, 0:2) :: moms
-        real(wp), dimension(3) :: M1, M3
-        real(wp), dimension(2) :: myrho, myrho3, up, up3, Vf
-        real(wp) :: bu, bv, d20, d11, d_02, c20, c11, c02
-        real(wp) :: mu2avg, mu2, vp21, vp22, rho21, rho22
-
-        moms(0, 0) = momin(1)
-        moms(1, 0) = momin(2)
-        moms(0, 1) = momin(3)
-        moms(2, 0) = momin(4)
-        moms(1, 1) = momin(5)
-        moms(0, 2) = momin(6)
-
-        bu = moms(1, 0)/moms(0, 0)
-        bv = moms(0, 1)/moms(0, 0)
-        d20 = moms(2, 0)/moms(0, 0)
-        d11 = moms(1, 1)/moms(0, 0)
-        d_02 = moms(0, 2)/moms(0, 0)
-
-        c20 = d20 - bu**2._wp; 
-        c11 = d11 - bu*bv; 
-        c02 = d_02 - bv**2._wp; 
-        M1 = (/1._wp, 0._wp, c20/)
-        call s_hyqmom(myrho, up, M1)
-        Vf = c11*up/c20
-
-        mu2avg = c02 - sum(myrho(:)*(Vf(:)**2._wp))
-
-        mu2avg = maxval((/mu2avg, 0._wp/))
-        mu2 = mu2avg
-        M3 = (/1._wp, 0._wp, mu2/)
-        call s_hyqmom(myrho3, up3, M3)
-
-        vp21 = up3(1)
-        vp22 = up3(2)
-        rho21 = myrho3(1)
-        rho22 = myrho3(2)
-
-        wght(1) = myrho(1)*rho21
-        wght(2) = myrho(1)*rho22
-        wght(3) = myrho(2)*rho21
-        wght(4) = myrho(2)*rho22
-        wght = moms(0, 0)*wght
-
-        abscX(1) = up(1)
-        abscX(2) = up(1)
-        abscX(3) = up(2)
-        abscX(4) = up(2)
-        abscX = bu + abscX
-
-        abscY(1) = Vf(1) + vp21
-        abscY(2) = Vf(1) + vp22
-        abscY(3) = Vf(2) + vp21
-        abscY(4) = Vf(2) + vp22
-        abscY = bv + abscY
-
-    end subroutine s_chyqmom
-
-    pure subroutine s_hyqmom(frho, fup, fmom)
+            real(wp), dimension(nmom), intent(in) :: momin
+            real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY
+
+            ! Local variables
+            real(wp), dimension(0:2, 0:2) :: moms
+            real(wp), dimension(3) :: M1, M3
+            real(wp), dimension(2) :: myrho, myrho3, up, up3, Vf
+            real(wp) :: bu, bv, d20, d11, d_02, c20, c11, c02
+            real(wp) :: mu2, vp21, vp22, rho21, rho22
+
+            ! Assign moments to 2D array for clarity
+            moms(0, 0) = momin(1)
+            moms(1, 0) = momin(2)
+            moms(0, 1) = momin(3)
+            moms(2, 0) = momin(4)
+            moms(1, 1) = momin(5)
+            moms(0, 2) = momin(6)
+
+            ! Compute means and central moments
+            bu = moms(1, 0)/moms(0, 0)
+            bv = moms(0, 1)/moms(0, 0)
+            d20 = moms(2, 0)/moms(0, 0)
+            d11 = moms(1, 1)/moms(0, 0)
+            d_02 = moms(0, 2)/moms(0, 0)
+
+            c20 = d20 - bu**2._wp
+            c11 = d11 - bu*bv
+            c02 = d_02 - bv**2._wp
+
+            ! First 1D quadrature (X direction)
+            M1 = (/1._wp, 0._wp, c20/)
+            call s_hyqmom(myrho, up, M1)
+            Vf = c11*up/c20
+
+            ! Second 1D quadrature (Y direction, conditional on X)
+            mu2 = max(0._wp, c02 - sum(myrho*(Vf**2._wp)))
+            M3 = (/1._wp, 0._wp, mu2/)
+            call s_hyqmom(myrho3, up3, M3)
+
+            ! Assign roots and weights for 2D quadrature
+            vp21 = up3(1)
+            vp22 = up3(2)
+            rho21 = myrho3(1)
+            rho22 = myrho3(2)
+
+            ! Compute weights (vectorized)
+            wght = moms(0, 0)*[myrho(1)*rho21, myrho(1)*rho22, myrho(2)*rho21, myrho(2)*rho22]
+
+            ! Compute abscissas (vectorized)
+            abscX = bu + [up(1), up(1), up(2), up(2)]
+            abscY = bv + [Vf(1) + vp21, Vf(1) + vp22, Vf(2) + vp21, Vf(2) + vp22]
+
+        end subroutine s_chyqmom
+
+        pure subroutine s_hyqmom(frho, fup, fmom)
 #ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_hyqmom
+            !DIR$ INLINEALWAYS s_hyqmom
 #else
-        $:ROUTINE()
+            $:ROUTINE()
 #endif
-        real(wp), dimension(2), intent(inout) :: frho, fup
-        real(wp), dimension(3), intent(in) :: fmom
+            real(wp), dimension(2), intent(inout) :: frho, fup
+            real(wp), dimension(3), intent(in) :: fmom
 
-        real(wp) :: bu, d2, c2
+            real(wp) :: bu, d2, c2
 
-        bu = fmom(2)/fmom(1)
-        d2 = fmom(3)/fmom(1)
-        c2 = d2 - bu**2._wp
-        frho(1) = fmom(1)/2._wp; 
-        frho(2) = fmom(1)/2._wp; 
-        c2 = maxval((/c2, verysmall/))
-        fup(1) = bu - sqrt(c2)
-        fup(2) = bu + sqrt(c2)
+            bu = fmom(2)/fmom(1)
+            d2 = fmom(3)/fmom(1)
+            c2 = d2 - bu**2._wp
+            frho(1) = fmom(1)/2._wp; 
+            frho(2) = fmom(1)/2._wp; 
+            c2 = maxval((/c2, verysmall/))
+            fup(1) = bu - sqrt(c2)
+            fup(2) = bu + sqrt(c2)
 
-    end subroutine s_hyqmom
+        end subroutine s_hyqmom
 
-    pure function f_quad(abscX, abscY, wght_in, q, r, s)
-        $:ROUTINE()
-        real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
-        real(wp), intent(in) :: q, r, s
+        pure function f_quad(abscX, abscY, wght_in, q, r, s)
+            $:ROUTINE()
+            real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
+            real(wp), intent(in) :: q, r, s
 
-        real(wp) :: f_quad_RV, f_quad
-        integer :: i
+            real(wp) :: f_quad_RV, f_quad
+            integer :: i
 
-        f_quad = 0._wp
-        do i = 1, nb
-            f_quad_RV = sum(wght_in(:, i)*(abscX(:, i)**q)*(abscY(:, i)**r))
-            f_quad = f_quad + weight(i)*(R0(i)**s)*f_quad_RV
-        end do
+            f_quad = 0._wp
+            do i = 1, nb
+                f_quad_RV = sum(wght_in(:, i)*(abscX(:, i)**q)*(abscY(:, i)**r))
+                f_quad = f_quad + weight(i)*(R0(i)**s)*f_quad_RV
+            end do
 
-    end function f_quad
+        end function f_quad
 
-    pure function f_quad2D(abscX, abscY, wght_in, pow)
-        $:ROUTINE()
-        real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
-        real(wp), dimension(3), intent(in) :: pow
+        pure function f_quad2D(abscX, abscY, wght_in, pow)
+            $:ROUTINE()
+            real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
+            real(wp), dimension(3), intent(in) :: pow
+
+            real(wp) :: f_quad2D
 
-        real(wp) :: f_quad2D
+            f_quad2D = sum(wght_in(:)*(abscX(:)**pow(1))*(abscY(:)**pow(2)))
+        end function f_quad2D
 
-        f_quad2D = sum(wght_in(:)*(abscX(:)**pow(1))*(abscY(:)**pow(2)))
-    end function f_quad2D
+    end subroutine s_mom_inv
 
 end module m_qbmm

From 2914bed6bb7f2fe0895746494a208fd0463ae3d4 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Wed, 4 Jun 2025 21:52:51 -0400
Subject: [PATCH 24/75] Fix packer compare message (#857)

---
 toolchain/mfc/packer/packer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/toolchain/mfc/packer/packer.py b/toolchain/mfc/packer/packer.py
index 98555d8477..0ae6671e45 100644
--- a/toolchain/mfc/packer/packer.py
+++ b/toolchain/mfc/packer/packer.py
@@ -41,7 +41,10 @@ def packer():
         if err is not None:
             raise MFCException(err)
     elif ARG("packer") == "compare":
-        cons.print(f"Comparing [magenta]{os.path.relpath(ARG('input1'))}[/magenta] to [magenta]{os.path.relpath(ARG('input1'))}[/magenta]:")
+        cons.print(
+            f"Comparing [magenta]{os.path.relpath(ARG('input1'))}[/magenta] to "
+            f"[magenta]{os.path.relpath(ARG('input2'))}[/magenta]:"
+        )
 
         cons.indent()
         cons.print()

From d9656a78bf8a0a0df292c81ae613a67c0d7f404c Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 16:08:26 -0400
Subject: [PATCH 25/75] Fixed some OpenACC directives (#859)

---
 src/common/m_finite_differences.fpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp
index 18a0a84484..80db1d8c79 100644
--- a/src/common/m_finite_differences.fpp
+++ b/src/common/m_finite_differences.fpp
@@ -18,8 +18,7 @@ contains
         integer :: x, y, z !< Generic loop iterators
 
         real(wp) :: divergence
-
-        $:PARALLEL_LOOP(collapse=3, private=["divergence"])
+        $:PARALLEL_LOOP(collapse=3, gang vector default(present) private=["divergence"])
         do x = ix_s%beg, ix_s%end
             do y = iy_s%beg, iy_s%end
                 do z = iz_s%beg, iz_s%end

From 6a348458cc1b585f75dda118425906d3f418ed13 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 6 Jun 2025 17:48:36 -0400
Subject: [PATCH 26/75] cmake flags (#862)

---
 CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a5835bd24e..8fa0731f75 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,13 +135,18 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
     if (CMAKE_BUILD_TYPE STREQUAL "Debug")
         add_compile_options(
             -Wall
+            -Wextra	
+            -Wintrinsic-shadow	
+            -Wunderflow	
+            -Wrealloc-lhs	
+            -Wsurprising	
             -fcheck=all,no-array-temps
             -fbacktrace
             -fimplicit-none
-            #-ffpe-trap=invalid,zero,denormal,overflow
             -fsignaling-nans
             -finit-real=snan
             -finit-integer=-99999999
+            #-ffpe-trap=invalid,zero,denormal,overflow
 	    )
     endif()
 

From 217817fcbe5c7b9a44112fd9bc1ece6f0227db16 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 6 Jun 2025 20:42:17 -0400
Subject: [PATCH 27/75] Benchmarking news a new temp directory and there ya go

---
 .github/workflows/phoenix/bench.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index 0c2ed163eb..8812e00e3b 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -8,8 +8,13 @@ if [ "$job_device" == "gpu" ]; then
     device_opts="--gpu -g $gpu_ids"
 fi
 
+mkdir -p /storage/scratch1/6/sbryngelson3/mytmp_build
+export TMPDIR=/storage/scratch1/6/sbryngelson3/mytmp_build
+
 if ["$job_device" == "gpu"]; then
     ./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 else
     ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 fi
+
+unset TMPDIR

From b443875fcb88d70b0731fa24dc6671009bacd05e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Jun 2025 00:28:57 -0400
Subject: [PATCH 28/75] refac qbmm  (#861)

---
 CMakeLists.txt            | 9 ++++-----
 src/simulation/m_qbmm.fpp | 3 +--
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8fa0731f75..d486d43981 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,17 +136,16 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
         add_compile_options(
             -Wall
             -Wextra	
-            -Wintrinsic-shadow	
-            -Wunderflow	
-            -Wrealloc-lhs	
-            -Wsurprising	
             -fcheck=all,no-array-temps
             -fbacktrace
             -fimplicit-none
             -fsignaling-nans
             -finit-real=snan
             -finit-integer=-99999999
-            #-ffpe-trap=invalid,zero,denormal,overflow
+            -Wintrinsic-shadow	
+            -Wunderflow	
+            -Wrealloc-lhs	
+            -Wsurprising	
 	    )
     endif()
 
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index c5338bfed0..3136146b4d 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -563,8 +563,7 @@ contains
 
     end subroutine s_compute_qbmm_rhs
 
-!Coefficient array for non-polytropic model (pb and mv values are accounted in wght_pb and wght_mv)
-
+    !Coefficient array for non-polytropic model (pb and mv values are accounted in wght_pb and wght_mv)
     pure subroutine s_coeff_nonpoly(pres, rho, c, coeffs)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_coeff_nonpoly

From ffb479a25c1e970a24d68f12a270de75274ded46 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 16:10:42 -0400
Subject: [PATCH 29/75] Cody tidying - Remove unused dummy variables (#854)

Co-authored-by: Xuzheng Tian <xtian64@login-phoenix-rh9-3.pace.gatech.edu>
Co-authored-by: Spencer Bryngelson <sbryngelson@gmail.com>
Co-authored-by: Spencer Bryngelson <shb@gatech.edu>
---
 src/common/m_boundary_common.fpp          |  5 +-
 src/common/m_mpi_common.fpp               |  7 +--
 src/common/m_phase_change.fpp             |  9 ++--
 src/common/m_variables_conversion.fpp     | 40 +++++---------
 src/pre_process/m_boundary_conditions.fpp | 15 +++---
 src/pre_process/m_model.fpp               | 16 ++----
 src/pre_process/m_patches.fpp             | 17 +++---
 src/pre_process/m_start_up.fpp            |  5 +-
 src/pre_process/p_main.f90                |  2 +-
 src/simulation/m_bubbles.fpp              |  2 +-
 src/simulation/m_bubbles_EE.fpp           |  3 +-
 src/simulation/m_bubbles_EL.fpp           | 13 ++---
 src/simulation/m_cbc.fpp                  | 21 +++-----
 src/simulation/m_compute_cbc.fpp          | 17 ++----
 src/simulation/m_data_output.fpp          |  8 +--
 src/simulation/m_hyperelastic.fpp         |  2 +-
 src/simulation/m_ibm.fpp                  |  8 +--
 src/simulation/m_qbmm.fpp                 |  6 +--
 src/simulation/m_rhs.fpp                  | 19 +++----
 src/simulation/m_riemann_solvers.fpp      | 66 +++++++----------------
 src/simulation/m_sim_helpers.fpp          |  6 +--
 src/simulation/m_start_up.fpp             | 15 ++----
 src/simulation/m_surface_tension.fpp      |  9 ++--
 src/simulation/m_time_steppers.fpp        | 18 +++----
 src/simulation/m_viscous.fpp              | 12 ++---
 src/simulation/m_weno.fpp                 |  8 ++-
 src/simulation/p_main.fpp                 | 11 ++--
 27 files changed, 124 insertions(+), 236 deletions(-)

diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index f793c94e4d..d9f6094600 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -143,7 +143,7 @@ contains
                     case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP)
                         call s_ghost_cell_extrapolation(q_prim_vf, pb, mv, 2, -1, k, l)
                     case (BC_AXIS)
-                        call s_axis(q_prim_vf, pb, mv, 2, -1, k, l)
+                        call s_axis(q_prim_vf, pb, mv, k, l)
                     case (BC_REFLECTIVE)
                         call s_symmetry(q_prim_vf, pb, mv, 2, -1, k, l)
                     case (BC_PERIODIC)
@@ -714,7 +714,7 @@ contains
 
     end subroutine s_periodic
 
-    pure subroutine s_axis(q_prim_vf, pb, mv, bc_dir, bc_loc, k, l)
+    pure subroutine s_axis(q_prim_vf, pb, mv, k, l)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_axis
 #else
@@ -722,7 +722,6 @@ contains
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
-        integer, intent(in) :: bc_dir, bc_loc
         integer, intent(in) :: k, l
 
         integer :: j, q, i
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 2d1b079467..1fdc19aa0d 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -376,21 +376,17 @@ contains
         !!  @param Rc_min_glb Global minimum Rc stability criterion
     impure subroutine s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, &
                                                               vcfl_max_loc, &
-                                                              ccfl_max_loc, &
                                                               Rc_min_loc, &
                                                               icfl_max_glb, &
                                                               vcfl_max_glb, &
-                                                              ccfl_max_glb, &
                                                               Rc_min_glb)
 
         real(wp), intent(in) :: icfl_max_loc
         real(wp), intent(in) :: vcfl_max_loc
-        real(wp), intent(in) :: ccfl_max_loc
         real(wp), intent(in) :: Rc_min_loc
 
         real(wp), intent(out) :: icfl_max_glb
         real(wp), intent(out) :: vcfl_max_glb
-        real(wp), intent(out) :: ccfl_max_glb
         real(wp), intent(out) :: Rc_min_glb
 
 #ifdef MFC_SIMULATION
@@ -1179,7 +1175,8 @@ contains
                 #:endif
             end if
         #:endfor
-
+        p_send => buff_send(0)
+        p_recv => buff_recv(0)
         ! Send/Recv
 #ifdef MFC_SIMULATION
         #:for rdma_mpi in [False, True]
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 7725743e78..5bbb8c4e8c 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -147,7 +147,7 @@ contains
 
                     ! Calling pT-equilibrium for either finishing phase-change module, or as an IC for the pTg-equilibrium
                     ! for this case, MFL cannot be either 0 or 1, so I chose it to be 2
-                    call s_infinite_pt_relaxation_k(j, k, l, 2, pS, p_infpT, rM, q_cons_vf, rhoe, TS)
+                    call s_infinite_pt_relaxation_k(j, k, l, 2, pS, p_infpT, q_cons_vf, rhoe, TS)
 
                     ! check if pTg-equilibrium is required
                     ! NOTE that NOTHING else needs to be updated OTHER than the individual partial densities
@@ -168,7 +168,7 @@ contains
                         q_cons_vf(vp + contxb - 1)%sf(j, k, l) = (1.0_wp - mixM)*rM
 
                         ! calling pT-equilibrium for overheated vapor, which is MFL = 0
-                        call s_infinite_pt_relaxation_k(j, k, l, 0, pSOV, p_infOV, rM, q_cons_vf, rhoe, TSOV)
+                        call s_infinite_pt_relaxation_k(j, k, l, 0, pSOV, p_infOV, q_cons_vf, rhoe, TSOV)
 
                         ! calculating Saturation temperature
                         call s_TSat(pSOV, TSatOV, TSOV)
@@ -181,7 +181,7 @@ contains
                         q_cons_vf(vp + contxb - 1)%sf(j, k, l) = mixM*rM
 
                         ! calling pT-equilibrium for subcooled liquid, which is MFL = 1
-                        call s_infinite_pt_relaxation_k(j, k, l, 1, pSSL, p_infSL, rM, q_cons_vf, rhoe, TSSL)
+                        call s_infinite_pt_relaxation_k(j, k, l, 1, pSSL, p_infSL, q_cons_vf, rhoe, TSSL)
 
                         ! calculating Saturation temperature
                         call s_TSat(pSSL, TSatSL, TSSL)
@@ -285,7 +285,7 @@ contains
         !!  @param q_cons_vf Cell-average conservative variables
         !!  @param rhoe mixture energy
         !!  @param TS equilibrium temperature at the interface
-    pure subroutine s_infinite_pt_relaxation_k(j, k, l, MFL, pS, p_infpT, rM, q_cons_vf, rhoe, TS)
+    pure subroutine s_infinite_pt_relaxation_k(j, k, l, MFL, pS, p_infpT, q_cons_vf, rhoe, TS)
 
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_infinite_pt_relaxation_k
@@ -297,7 +297,6 @@ contains
         integer, intent(in) :: j, k, l, MFL
         real(wp), intent(out) :: pS
         real(wp), dimension(num_fluids), intent(out) :: p_infpT
-        real(wp), intent(in) :: rM
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         real(wp), intent(in) :: rhoe
         real(wp), intent(out) :: TS
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 796dcb5003..a23b5ff2ae 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -92,11 +92,11 @@ contains
 
         if (model_eqns == 1) then        ! Gamma/pi_inf model
             call s_convert_mixture_to_mixture_variables(q_vf, i, j, k, &
-                                                        rho, gamma, pi_inf, qv, Re_K, G_K, G)
+                                                        rho, gamma, pi_inf, qv)
 
         else if (bubbles_euler) then
             call s_convert_species_to_mixture_variables_bubbles(q_vf, i, j, k, &
-                                                                rho, gamma, pi_inf, qv, Re_K, G_K, G)
+                                                                rho, gamma, pi_inf, qv, Re_K)
         else
             ! Volume fraction model
             call s_convert_species_to_mixture_variables(q_vf, i, j, k, &
@@ -207,7 +207,7 @@ contains
         !! @param pi_inf liquid stiffness
         !! @param qv fluid reference energy
     subroutine s_convert_mixture_to_mixture_variables(q_vf, i, j, k, &
-                                                      rho, gamma, pi_inf, qv, Re_K, G_K, G)
+                                                      rho, gamma, pi_inf, qv)
 
         type(scalar_field), dimension(sys_size), intent(in) :: q_vf
         integer, intent(in) :: i, j, k
@@ -217,11 +217,6 @@ contains
         real(wp), intent(out), target :: pi_inf
         real(wp), intent(out), target :: qv
 
-        real(wp), optional, dimension(2), intent(out) :: Re_K
-
-        real(wp), optional, intent(out) :: G_K
-        real(wp), optional, dimension(num_fluids), intent(in) :: G
-
         ! Transferring the density, the specific heat ratio function and the
         ! liquid stiffness function, respectively
         rho = q_vf(1)%sf(i, j, k)
@@ -254,7 +249,7 @@ contains
         !! @param pi_inf liquid stiffness
         !! @param qv fluid reference energy
     subroutine s_convert_species_to_mixture_variables_bubbles(q_vf, j, k, l, &
-                                                              rho, gamma, pi_inf, qv, Re_K, G_K, G)
+                                                              rho, gamma, pi_inf, qv, Re_K)
 
         type(scalar_field), dimension(sys_size), intent(in) :: q_vf
 
@@ -266,8 +261,6 @@ contains
         real(wp), intent(out), target :: qv
 
         real(wp), optional, dimension(2), intent(out) :: Re_K
-        real(wp), optional, intent(out) :: G_K
-        real(wp), optional, dimension(num_fluids), intent(in) :: G
 
         integer :: i, q
         real(wp), dimension(num_fluids) :: alpha_rho_K, alpha_K
@@ -464,7 +457,7 @@ contains
 
     pure subroutine s_convert_species_to_mixture_variables_acc(rho_K, &
                                                                gamma_K, pi_inf_K, qv_K, &
-                                                               alpha_K, alpha_rho_K, Re_K, k, l, r, &
+                                                               alpha_K, alpha_rho_K, Re_K, &
                                                                G_K, G)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_acc
@@ -481,8 +474,6 @@ contains
         real(wp), optional, intent(out) :: G_K
         real(wp), optional, dimension(num_fluids), intent(in) :: G
 
-        integer, intent(in) :: k, l, r
-
         integer :: i, j !< Generic loop iterators
         real(wp) :: alpha_K_sum
 
@@ -548,7 +539,7 @@ contains
 
     pure subroutine s_convert_species_to_mixture_variables_bubbles_acc(rho_K, &
                                                                        gamma_K, pi_inf_K, qv_K, &
-                                                                       alpha_K, alpha_rho_K, Re_K, k, l, r)
+                                                                       alpha_K, alpha_rho_K, Re_K)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_bubbles_acc
 #else
@@ -561,7 +552,6 @@ contains
             !! Partial densities and volume fractions
 
         real(wp), dimension(2), intent(out) :: Re_K
-        integer, intent(in) :: k, l, r
 
         integer :: i, j !< Generic loop iterators
 
@@ -820,16 +810,12 @@ contains
     subroutine s_convert_conservative_to_primitive_variables(qK_cons_vf, &
                                                              q_T_sf, &
                                                              qK_prim_vf, &
-                                                             ibounds, &
-                                                             gm_alphaK_vf)
+                                                             ibounds)
 
         type(scalar_field), dimension(sys_size), intent(in) :: qK_cons_vf
         type(scalar_field), intent(inout) :: q_T_sf
         type(scalar_field), dimension(sys_size), intent(inout) :: qK_prim_vf
         type(int_bounds_info), dimension(1:3), intent(in) :: ibounds
-        type(scalar_field), &
-            allocatable, optional, dimension(:), &
-            intent(in) :: gm_alphaK_vf
 
         real(wp), dimension(num_fluids) :: alpha_K, alpha_rho_K
         real(wp), dimension(2) :: Re_K
@@ -903,13 +889,13 @@ contains
                         ! If in simulation, use acc mixture subroutines
                         if (elasticity) then
                             call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, alpha_K, &
-                                                                            alpha_rho_K, Re_K, j, k, l, G_K, Gs)
+                                                                            alpha_rho_K, Re_K, G_K, Gs)
                         else if (bubbles_euler) then
                             call s_convert_species_to_mixture_variables_bubbles_acc(rho_K, gamma_K, pi_inf_K, qv_K, &
-                                                                                    alpha_K, alpha_rho_K, Re_K, j, k, l)
+                                                                                    alpha_K, alpha_rho_K, Re_K)
                         else
                             call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, &
-                                                                            alpha_K, alpha_rho_K, Re_K, j, k, l)
+                                                                            alpha_K, alpha_rho_K, Re_K)
                         end if
 #else
                         ! If pre-processing, use non acc mixture subroutines
@@ -1511,13 +1497,13 @@ contains
                     if (elasticity) then
                         call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, &
                                                                         alpha_K, alpha_rho_K, Re_K, &
-                                                                        j, k, l, G_K, Gs)
+                                                                        G_K, Gs)
                     else if (bubbles_euler) then
                         call s_convert_species_to_mixture_variables_bubbles_acc(rho_K, gamma_K, &
-                                                                                pi_inf_K, qv_K, alpha_K, alpha_rho_K, Re_K, j, k, l)
+                                                                                pi_inf_K, qv_K, alpha_K, alpha_rho_K, Re_K)
                     else
                         call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, &
-                                                                        alpha_K, alpha_rho_K, Re_K, j, k, l)
+                                                                        alpha_K, alpha_rho_K, Re_K)
                     end if
 
                     ! Computing the energy from the pressure
diff --git a/src/pre_process/m_boundary_conditions.fpp b/src/pre_process/m_boundary_conditions.fpp
index 1e8f160b38..4fc2e5ff03 100644
--- a/src/pre_process/m_boundary_conditions.fpp
+++ b/src/pre_process/m_boundary_conditions.fpp
@@ -29,9 +29,8 @@ module m_boundary_conditions
  s_write_parallel_boundary_condition_files
 
 contains
-    impure subroutine s_line_segment_bc(patch_id, q_prim_vf, bc_type)
+    impure subroutine s_line_segment_bc(patch_id, bc_type)
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(integer_field), dimension(1:num_dims, -1:1), intent(inout) :: bc_type
         integer, intent(in) :: patch_id
 
@@ -79,9 +78,8 @@ contains
 
     end subroutine s_line_segment_bc
 
-    impure subroutine s_circle_bc(patch_id, q_prim_vf, bc_type)
+    impure subroutine s_circle_bc(patch_id, bc_type)
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(integer_field), dimension(1:num_dims, -1:1), intent(inout) :: bc_type
 
         integer, intent(in) :: patch_id
@@ -143,9 +141,8 @@ contains
 
     end subroutine s_circle_bc
 
-    impure subroutine s_rectangle_bc(patch_id, q_prim_vf, bc_type)
+    impure subroutine s_rectangle_bc(patch_id, bc_type)
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(integer_field), dimension(1:num_dims, -1:1), intent(inout) :: bc_type
 
         integer, intent(in) :: patch_id
@@ -247,9 +244,9 @@ contains
                 end if
 
                 if (patch_bc(i)%geometry == 2) then
-                    call s_circle_bc(i, q_prim_vf, bc_type)
+                    call s_circle_bc(i, bc_type)
                 elseif (patch_bc(i)%geometry == 3) then
-                    call s_rectangle_bc(i, q_prim_vf, bc_type)
+                    call s_rectangle_bc(i, bc_type)
                 end if
             end do
             !< Apply 1D patches to 2D domain
@@ -260,7 +257,7 @@ contains
                 end if
 
                 if (patch_bc(i)%geometry == 1) then
-                    call s_line_segment_bc(i, q_prim_vf, bc_type)
+                    call s_line_segment_bc(i, bc_type)
                 end if
             end do
         end if
diff --git a/src/pre_process/m_model.fpp b/src/pre_process/m_model.fpp
index 214a5fcda1..852cc1a22b 100644
--- a/src/pre_process/m_model.fpp
+++ b/src/pre_process/m_model.fpp
@@ -1104,11 +1104,10 @@ contains
     !! @param point                        The cell centers of the current levelset cell
     !! @param spacing                      Dimensions of the current levelset cell
     !! @return                             Distance which the levelset distance without interpolation
-    pure function f_distance(boundary_v, boundary_vertex_count, boundary_edge_count, point, spacing) result(distance)
-        integer, intent(in) :: boundary_vertex_count, boundary_edge_count
+    pure function f_distance(boundary_v, boundary_edge_count, point) result(distance)
+        integer, intent(in) :: boundary_edge_count
         real(wp), intent(in), dimension(1:boundary_edge_count, 1:3, 1:2) :: boundary_v
         t_vec3, intent(in) :: point
-        t_vec3, intent(in) :: spacing
 
         integer :: i
         real(wp) :: dist_buffer1, dist_buffer2
@@ -1132,16 +1131,13 @@ contains
 
     !> This procedure determines the levelset normals of 2D models without interpolation.
     !! @param boundary_v                   Group of all the boundary vertices of the 2D model without interpolation
-    !! @param boundary_vertex_count        Output the total number of boundary vertices
     !! @param boundary_edge_count          Output the total number of boundary edges
     !! @param point                        The cell centers of the current levelset cell
-    !! @param spacing                      Dimensions of the current levelset cell
     !! @param normals                      Output levelset normals without interpolation
-    pure subroutine f_normals(boundary_v, boundary_vertex_count, boundary_edge_count, point, spacing, normals)
-        integer, intent(in) :: boundary_vertex_count, boundary_edge_count
+    pure subroutine f_normals(boundary_v, boundary_edge_count, point, normals)
+        integer, intent(in) :: boundary_edge_count
         real(wp), intent(in), dimension(1:boundary_edge_count, 1:3, 1:2) :: boundary_v
         t_vec3, intent(in) :: point
-        t_vec3, intent(in) :: spacing
         t_vec3, intent(out) :: normals
 
         integer :: i, idx_buffer
@@ -1176,13 +1172,11 @@ contains
     !! @param interpolated_boundary_v      Group of all the boundary vertices of the interpolated 2D model
     !! @param total_vertices               Total number of vertices after interpolation
     !! @param point                        The cell centers of the current levelset cell
-    !! @param spacing                      Dimensions of the current levelset cell
     !! @return                             Distance which the levelset distance without interpolation
-    pure function f_interpolated_distance(interpolated_boundary_v, total_vertices, point, spacing) result(distance)
+    pure function f_interpolated_distance(interpolated_boundary_v, total_vertices, point) result(distance)
         integer, intent(in) :: total_vertices
         real(wp), intent(in), dimension(1:total_vertices, 1:3) :: interpolated_boundary_v
         t_vec3, intent(in) :: point
-        t_vec3, intent(in) :: spacing
 
         integer :: i !< Loop iterator
         real(wp) :: dist_buffer, min_dist
diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp
index 584048aa02..fe861c4c06 100644
--- a/src/pre_process/m_patches.fpp
+++ b/src/pre_process/m_patches.fpp
@@ -2300,8 +2300,7 @@ contains
                             if (interpolate) then
                                 STL_levelset%sf(i, j, k, patch_id) = f_interpolated_distance(interpolated_boundary_v, &
                                                                                              total_vertices, &
-                                                                                             point, &
-                                                                                             (/dx, dy, dz/))
+                                                                                             point)
                             else
                                 STL_levelset%sf(i, j, k, patch_id) = distance
                             end if
@@ -2324,15 +2323,12 @@ contains
                                 ! Get the shortest distance between the cell center and the model boundary
                                 STL_levelset%sf(i, j, 0, patch_id) = f_interpolated_distance(interpolated_boundary_v, &
                                                                                              total_vertices, &
-                                                                                             point, &
-                                                                                             (/dx, dy, dz/))
+                                                                                             point)
                             else
                                 ! Get the shortest distance between the cell center and the interpolated model boundary
                                 STL_levelset%sf(i, j, 0, patch_id) = f_distance(boundary_v, &
-                                                                                boundary_vertex_count, &
                                                                                 boundary_edge_count, &
-                                                                                point, &
-                                                                                (/dx, dy, dz/))
+                                                                                point)
                             end if
 
                             ! Correct the sign of the levelset
@@ -2342,10 +2338,9 @@ contains
 
                             ! Get the boundary normals
                             call f_normals(boundary_v, &
-                                            boundary_vertex_count, &
-                                            boundary_edge_count, &
-                                            point, &
-                                            & (/dx, dy, dz/), normals)
+                                           boundary_edge_count, &
+                                           point, &
+                                           normals)
 
                             ! Correct the sign of the levelset_norm
                             if (patch_id_fp(i, j, k) == 0) then
diff --git a/src/pre_process/m_start_up.fpp b/src/pre_process/m_start_up.fpp
index 1c9c79d342..e33491576f 100644
--- a/src/pre_process/m_start_up.fpp
+++ b/src/pre_process/m_start_up.fpp
@@ -826,12 +826,9 @@ contains
 
     end subroutine s_read_grid
 
-    impure subroutine s_apply_initial_condition(start, finish, proc_time, time_avg, time_final, file_exists)
+    impure subroutine s_apply_initial_condition(start, finish)
 
         real(wp), intent(inout) :: start, finish
-        real(wp), dimension(:), intent(inout) :: proc_time
-        real(wp), intent(inout) :: time_avg, time_final
-        logical, intent(inout) :: file_exists
 
         ! Setting up the grid and the initial condition. If the grid is read in from
         ! preexisting grid data files, it is checked for consistency. If the grid is
diff --git a/src/pre_process/p_main.f90 b/src/pre_process/p_main.f90
index 56e6ab928f..4ce377f5cb 100644
--- a/src/pre_process/p_main.f90
+++ b/src/pre_process/p_main.f90
@@ -28,7 +28,7 @@ program p_main
 
     allocate (proc_time(0:num_procs - 1))
 
-    call s_apply_initial_condition(start, finish, proc_time, time_avg, time_final, file_exists)
+    call s_apply_initial_condition(start, finish)
 
     time_avg = abs(finish - start)
 
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index 11a7c075d2..54fc7fcc47 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -70,7 +70,7 @@ contains
         else if (bubble_model == 3) then
             ! Rayleigh-Plesset bubbles
             fCpbw = f_cpbw_KM(fR0, fR, fV, fpb)
-            f_rddot = f_rddot_RP(fP, fRho, fR, fV, fR0, fCpbw)
+            f_rddot = f_rddot_RP(fP, fRho, fR, fV, fCpbw)
         end if
 
     end function f_rddot
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index 714f95913c..d6c2b0212c 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -154,10 +154,9 @@ contains
         !!      that are needed for the bubble modeling
         !!  @param q_prim_vf Primitive variables
         !!  @param q_cons_vf Conservative variables
-    impure subroutine s_compute_bubble_EE_source(q_cons_vf, q_prim_vf, t_step, rhs_vf)
+    impure subroutine s_compute_bubble_EE_source(q_cons_vf, q_prim_vf, rhs_vf)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-        integer, intent(in) :: t_step
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
 
         real(wp) :: rddot
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index a1746ddac6..c455426180 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -503,12 +503,10 @@ contains
         !! @param rhs_vf Calculated change of conservative variables
         !! @param t_step Current time step
         !! @param stage Current stage in the time-stepper algorithm
-    subroutine s_compute_bubble_EL_dynamics(q_cons_vf, q_prim_vf, t_step, rhs_vf, stage)
+    subroutine s_compute_bubble_EL_dynamics(q_prim_vf, stage)
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
-        integer, intent(in) :: t_step, stage
+        integer, intent(in) :: stage
 
         real(wp) :: myVapFlux
         real(wp) :: preterm1, term2, paux, pint, Romega, term1_fac
@@ -582,8 +580,8 @@ contains
                 myalpha(i) = q_prim_vf(E_idx + i)%sf(cell(1), cell(2), cell(3))
             end do
             call s_convert_species_to_mixture_variables_acc(myRho, gamma, pi_inf, qv, myalpha, &
-                                                            myalpha_rho, Re, cell(1), cell(2), cell(3))
-            call s_compute_cson_from_pinf(k, q_prim_vf, myPinf, cell, myRho, gamma, pi_inf, myCson)
+                                                            myalpha_rho, Re)
+            call s_compute_cson_from_pinf(q_prim_vf, myPinf, cell, myRho, gamma, pi_inf, myCson)
 
             ! Adaptive time stepping
             adap_dt_stop = 0
@@ -740,13 +738,12 @@ contains
         !! @param gamma Liquid specific heat ratio
         !! @param pi_inf Liquid stiffness
         !! @param cson Calculated speed of sound
-    pure subroutine s_compute_cson_from_pinf(bub_id, q_prim_vf, pinf, cell, rhol, gamma, pi_inf, cson)
+    pure subroutine s_compute_cson_from_pinf(q_prim_vf, pinf, cell, rhol, gamma, pi_inf, cson)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_cson_from_pinf
 #else
         $:ROUTINE()
 #endif
-        integer, intent(in) :: bub_id
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         real(wp), intent(in) :: pinf, rhol, gamma, pi_inf
         integer, dimension(3), intent(in) :: cell
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index e306717513..6b19251034 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -808,9 +808,9 @@ contains
                         end do
 
                         if (bubbles_euler) then
-                            call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc, 0, k, r)
+                            call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc)
                         else
-                            call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc, 0, k, r)
+                            call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc)
                         end if
 
                         $:LOOP()
@@ -920,13 +920,13 @@ contains
 
                         if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_SLIP_WALL) .or. &
                             (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_SLIP_WALL)) then
-                            call s_compute_slip_wall_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
+                            call s_compute_slip_wall_L(lambda, L, rho, c, dpres_ds, dvel_ds)
                         else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_NR_SUB_BUFFER) .or. &
                                  (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_NR_SUB_BUFFER)) then
                             call s_compute_nonreflecting_subsonic_buffer_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds)
                         else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_NR_SUB_INFLOW) .or. &
                                  (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_NR_SUB_INFLOW)) then
-                            call s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
+                            call s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds)
                             ! Add GRCBC for Subsonic Inflow
                             if (bc_${XYZ}$%grcbc_in) then
                                 $:LOOP()
@@ -965,7 +965,7 @@ contains
                             call s_compute_constant_pressure_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
                         else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_SUP_INFLOW) .or. &
                                  (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_SUP_INFLOW)) then
-                            call s_compute_supersonic_inflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
+                            call s_compute_supersonic_inflow_L(L)
                         else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_SUP_OUTFLOW) .or. &
                                  (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_SUP_OUTFLOW)) then
                             call s_compute_supersonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds)
@@ -1118,8 +1118,7 @@ contains
         ! The reshaping of outputted data and disssociation of the FD and PI
         ! coefficients, or CBC coefficients, respectively, based on selected
         ! CBC coordinate direction.
-        call s_finalize_cbc(flux_vf, flux_src_vf, &
-                            ix, iy, iz)
+        call s_finalize_cbc(flux_vf, flux_src_vf)
     end subroutine s_cbc
 
     !>  The computation of parameters, the allocation of memory,
@@ -1406,18 +1405,12 @@ contains
         !!      are necessary in order to finalize the CBC application
         !!  @param flux_vf Cell-boundary-average fluxes
         !!  @param flux_src_vf Cell-boundary-average flux sources
-        !!  @param ix Index bound in the first coordinate direction
-        !!  @param iy Index bound in the second coordinate direction
-        !!  @param iz Index bound in the third coordinate direction
-    subroutine s_finalize_cbc(flux_vf, flux_src_vf, &
-                              ix, iy, iz)
+    subroutine s_finalize_cbc(flux_vf, flux_src_vf)
 
         type(scalar_field), &
             dimension(sys_size), &
             intent(inout) :: flux_vf, flux_src_vf
 
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-
         integer :: i, j, k, r !< Generic loop iterators
 
         ! Determining the indicial shift based on CBC location
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index 23136a0198..421f4ab07a 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -24,7 +24,7 @@ contains
         !!      Thompson (1990). At the slip wall (frictionless wall),
         !!      the normal component of velocity is zero at all times,
         !!      while the transverse velocities may be nonzero.
-    pure subroutine s_compute_slip_wall_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
+    pure subroutine s_compute_slip_wall_L(lambda, L, rho, c, dpres_ds, dvel_ds)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_slip_wall_L
 #else
@@ -33,10 +33,8 @@ contains
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: rho, c
-        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
         real(wp), intent(in) :: dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
-        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
 
         integer :: i
 
@@ -104,7 +102,7 @@ contains
         !!      see pg. 455, Thompson (1990). This nonreflecting subsonic
         !!      CBC assumes an incoming flow and reduces the amplitude of
         !!      any reflections caused by outgoing waves.
-    pure subroutine s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
+    pure subroutine s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_inflow_L
 #else
@@ -113,10 +111,8 @@ contains
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: rho, c
-        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
         real(wp), intent(in) :: dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
-        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
 
         integer :: i
 
@@ -264,19 +260,14 @@ contains
         !!      state, or nearly a steady state, CBC in which only the
         !!      transverse terms may generate a time dependence at the
         !!      inflow boundary.
-    pure subroutine s_compute_supersonic_inflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
+    pure subroutine s_compute_supersonic_inflow_L(L)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_inflow_L
 #else
         $:ROUTINE()
 #endif
-        real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: rho, c
-        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), intent(in) :: dpres_ds
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
-        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+
         integer :: i
 
         do i = 1, advxe
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index e8f4c92f06..fdf81d528a 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -99,7 +99,7 @@ contains
         if (.not. parallel_io) then
             call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta)
         else
-            call s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta)
+            call s_write_parallel_data_files(q_cons_vf, t_step, beta)
         end if
 
     end subroutine s_write_data_files
@@ -332,11 +332,9 @@ contains
         if (num_procs > 1) then
             call s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, &
                                                          vcfl_max_loc, &
-                                                         ccfl_max_loc, &
                                                          Rc_min_loc, &
                                                          icfl_max_glb, &
                                                          vcfl_max_glb, &
-                                                         ccfl_max_glb, &
                                                          Rc_min_glb)
         else
             icfl_max_glb = icfl_max_loc
@@ -785,13 +783,11 @@ contains
     !>  The goal of this subroutine is to output the grid and
         !!      conservative variables data files for given time-step.
         !!  @param q_cons_vf Cell-average conservative variables
-        !!  @param q_prim_vf Cell-average primitive variables
         !!  @param t_step Current time-step
         !!  @param beta Eulerian void fraction from lagrangian bubbles
-    impure subroutine s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta)
+    impure subroutine s_write_parallel_data_files(q_cons_vf, t_step, beta)
 
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: t_step
         type(scalar_field), intent(inout), optional :: beta
 
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index efe984ef05..994412a567 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -119,7 +119,7 @@ contains
                     end do
                     ! If in simulation, use acc mixture subroutines
                     call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha_k, &
-                                                                    alpha_rho_k, Re, j, k, l, G, Gs)
+                                                                    alpha_rho_k, Re, G, Gs)
                     rho = max(rho, sgm_eps)
                     G = max(G, sgm_eps)
                     !if ( G <= verysmall ) G_K = 0_wp
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index f4b83f9598..1db453b8a0 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -208,13 +208,13 @@ contains
                 ! If in simulation, use acc mixture subroutines
                 if (elasticity) then
                     call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv_K, alpha_IP, &
-                                                                    alpha_rho_IP, Re_K, j, k, l, G_K, Gs)
+                                                                    alpha_rho_IP, Re_K, G_K, Gs)
                 else if (bubbles_euler) then
                     call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv_K, alpha_IP, &
-                                                                            alpha_rho_IP, Re_K, j, k, l)
+                                                                            alpha_rho_IP, Re_K)
                 else
                     call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv_K, alpha_IP, &
-                                                                    alpha_rho_IP, Re_K, j, k, l)
+                                                                    alpha_rho_IP, Re_K)
                 end if
             end if
 
@@ -331,7 +331,7 @@ contains
             end if
 
             call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv_K, alpha_IP, &
-                                                            alpha_rho_IP, Re_K, j, k, l)
+                                                            alpha_rho_IP, Re_K)
 
             dyn_pres = 0._wp
 
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 3136146b4d..72a0114865 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -412,14 +412,13 @@ contains
 
     end subroutine s_initialize_qbmm_module
 
-    pure subroutine s_compute_qbmm_rhs(idir, q_cons_vf, q_prim_vf, rhs_vf, flux_n_vf, pb, rhs_pb, mv, rhs_mv)
+    pure subroutine s_compute_qbmm_rhs(idir, q_cons_vf, q_prim_vf, rhs_vf, flux_n_vf, pb, rhs_pb)
 
         integer, intent(in) :: idir
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf, q_prim_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
         type(scalar_field), dimension(sys_size), intent(in) :: flux_n_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, rhs_pb
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: mv, rhs_mv
 
         integer :: i, j, k, l, q
         real(wp) :: nb_q, nb_dot, R, R2, nR, nR2, nR_dot, nR2_dot, var, AX
@@ -700,7 +699,7 @@ contains
 
     end subroutine s_coeff
 
-    subroutine s_mom_inv(q_cons_vf, q_prim_vf, momsp, moms3d, pb, rhs_pb, mv, rhs_mv, ix, iy, iz, nbub_sc)
+    subroutine s_mom_inv(q_cons_vf, q_prim_vf, momsp, moms3d, pb, rhs_pb, mv, rhs_mv, ix, iy, iz)
 
         type(scalar_field), dimension(:), intent(inout) :: q_cons_vf, q_prim_vf
         type(scalar_field), dimension(:), intent(inout) :: momsp
@@ -708,7 +707,6 @@ contains
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, rhs_pb
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: mv, rhs_mv
         type(int_bounds_info), intent(in) :: ix, iy, iz
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:), intent(inout) :: nbub_sc
 
         real(wp), dimension(nmom) :: moms, msum
         real(wp), dimension(nnode, nb) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index c31095f6ca..846f04370a 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -670,8 +670,7 @@ contains
             q_cons_qp%vf, &
             q_T_sf, &
             q_prim_qp%vf, &
-            idwint, &
-            gm_alpha_qp%vf)
+            idwint)
         call nvtxEndRange
 
         call nvtxStartRange("RHS-COMMUNICATION")
@@ -688,7 +687,7 @@ contains
             if (t_step == t_step_stop) return
         end if
 
-        if (qbmm) call s_mom_inv(q_cons_qp%vf, q_prim_qp%vf, mom_sp, mom_3d, pb, rhs_pb, mv, rhs_mv, idwbuff(1), idwbuff(2), idwbuff(3), nbub)
+        if (qbmm) call s_mom_inv(q_cons_qp%vf, q_prim_qp%vf, mom_sp, mom_3d, pb, rhs_pb, mv, rhs_mv, idwbuff(1), idwbuff(2), idwbuff(3))
 
         if (viscous) then
             call nvtxStartRange("RHS-VISCOUS")
@@ -853,9 +852,7 @@ contains
                                         rhs_vf, &
                                         flux_n(id)%vf, &
                                         pb, &
-                                        rhs_pb, &
-                                        mv, &
-                                        rhs_mv)
+                                        rhs_pb)
                 call nvtxEndRange
             end if
             ! END: Additional physics and source terms
@@ -899,7 +896,6 @@ contains
             call s_compute_bubble_EE_source( &
                 q_cons_qp%vf(1:sys_size), &
                 q_prim_qp%vf(1:sys_size), &
-                t_step, &
                 rhs_vf)
             call nvtxEndRange
         end if
@@ -916,10 +912,7 @@ contains
             if (.not. adap_dt) then
                 call nvtxStartRange("RHS-EL-BUBBLES-DYN")
                 call s_compute_bubble_EL_dynamics( &
-                    q_cons_qp%vf(1:sys_size), &
                     q_prim_qp%vf(1:sys_size), &
-                    t_step, &
-                    rhs_vf, &
                     stage)
                 call nvtxEndRange
             end if
@@ -1957,19 +1950,19 @@ contains
 
                 call s_weno(v_vf(iv%beg:iv%end), &
                             vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, iv%beg:iv%end), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, iv%beg:iv%end), &
-                            norm_dir, weno_dir, &
+                            weno_dir, &
                             is1, is2, is3)
             else
                 call s_weno(v_vf(iv%beg:iv%end), &
                             vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, :), &
-                            norm_dir, weno_dir, &
+                            weno_dir, &
                             is1, is2, is3)
             end if
         else
 
             call s_weno(v_vf(iv%beg:iv%end), &
                         vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, :), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, :), vR_z(:, :, :, :), &
-                        norm_dir, weno_dir, &
+                        weno_dir, &
                         is1, is2, is3)
         end if
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 1f52925f00..6d0c7fb7d8 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -242,17 +242,14 @@ contains
                                                            norm_dir, &
                                                            ix, iy, iz)
         else
-            call s_compute_cartesian_viscous_source_flux(velL_vf, &
-                                                         dvelL_dx_vf, &
+            call s_compute_cartesian_viscous_source_flux(dvelL_dx_vf, &
                                                          dvelL_dy_vf, &
                                                          dvelL_dz_vf, &
-                                                         velR_vf, &
                                                          dvelR_dx_vf, &
                                                          dvelR_dy_vf, &
                                                          dvelR_dz_vf, &
                                                          flux_src_vf, &
-                                                         norm_dir, &
-                                                         ix, iy, iz)
+                                                         norm_dir)
         end if
     end subroutine s_compute_viscous_source_flux
 
@@ -346,19 +343,15 @@ contains
             qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
             dqL_prim_dy_vf, &
             dqL_prim_dz_vf, &
-            qL_prim_vf, &
             qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
             dqR_prim_dy_vf, &
             dqR_prim_dz_vf, &
-            qR_prim_vf, &
             norm_dir, ix, iy, iz)
 
         ! Reshaping inputted data based on dimensional splitting direction
         call s_initialize_riemann_solver( &
-            q_prim_vf, &
-            flux_vf, flux_src_vf, &
-            flux_gsrc_vf, &
-            norm_dir, ix, iy, iz)
+            flux_src_vf, &
+            norm_dir)
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
 
             if (norm_dir == ${NORM_DIR}$) then
@@ -1138,7 +1131,7 @@ contains
 
         call s_finalize_riemann_solver(flux_vf, flux_src_vf, &
                                        flux_gsrc_vf, &
-                                       norm_dir, ix, iy, iz)
+                                       norm_dir)
 
     end subroutine s_hll_riemann_solver
 
@@ -1271,20 +1264,16 @@ contains
             qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
             dqL_prim_dy_vf, &
             dqL_prim_dz_vf, &
-            qL_prim_vf, &
             qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
             dqR_prim_dy_vf, &
             dqR_prim_dz_vf, &
-            qR_prim_vf, &
             norm_dir, ix, iy, iz)
 
         ! Reshaping inputted data based on dimensional splitting direction
 
         call s_initialize_riemann_solver( &
-            q_prim_vf, &
-            flux_vf, flux_src_vf, &
-            flux_gsrc_vf, &
-            norm_dir, ix, iy, iz)
+            flux_src_vf, &
+            norm_dir)
 
         idx1 = 1; if (dir_idx(1) == 2) idx1 = 2; if (dir_idx(1) == 3) idx1 = 3
 
@@ -2998,7 +2987,6 @@ contains
 
         if (surface_tension) then
             call s_compute_capilary_source_flux( &
-                q_prim_vf, &
                 vel_src_rsx_vf, &
                 vel_src_rsy_vf, &
                 vel_src_rsz_vf, &
@@ -3008,7 +2996,7 @@ contains
 
         call s_finalize_riemann_solver(flux_vf, flux_src_vf, &
                                        flux_gsrc_vf, &
-                                       norm_dir, ix, iy, iz)
+                                       norm_dir)
 
     end subroutine s_hllc_riemann_solver
 
@@ -3068,13 +3056,13 @@ contains
 
         call s_populate_riemann_states_variables_buffers( &
             qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-            dqL_prim_dy_vf, dqL_prim_dz_vf, qL_prim_vf, &
+            dqL_prim_dy_vf, dqL_prim_dz_vf, &
             qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-            dqR_prim_dy_vf, dqR_prim_dz_vf, qR_prim_vf, &
+            dqR_prim_dy_vf, dqR_prim_dz_vf, &
             norm_dir, ix, iy, iz)
 
         call s_initialize_riemann_solver( &
-            q_prim_vf, flux_vf, flux_src_vf, flux_gsrc_vf, norm_dir, ix, iy, iz)
+            flux_src_vf, norm_dir)
 
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
             if (norm_dir == ${NORM_DIR}$) then
@@ -3317,7 +3305,7 @@ contains
         #:endfor
 
         call s_finalize_riemann_solver(flux_vf, flux_src_vf, flux_gsrc_vf, &
-                                       norm_dir, ix, iy, iz)
+                                       norm_dir)
     end subroutine s_hlld_riemann_solver
 
     !>  The computation of parameters, the allocation of memory,
@@ -3464,11 +3452,9 @@ contains
         qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
         dqL_prim_dy_vf, &
         dqL_prim_dz_vf, &
-        qL_prim_vf, &
         qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
         dqR_prim_dy_vf, &
         dqR_prim_dz_vf, &
-        qR_prim_vf, &
         norm_dir, ix, iy, iz)
 
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
@@ -3477,8 +3463,7 @@ contains
             allocatable, dimension(:), &
             intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
                              dqL_prim_dy_vf, dqR_prim_dy_vf, &
-                             dqL_prim_dz_vf, dqR_prim_dz_vf, &
-                             qL_prim_vf, qR_prim_vf
+                             dqL_prim_dz_vf, dqR_prim_dz_vf
 
         integer, intent(in) :: norm_dir
         type(int_bounds_info), intent(in) :: ix, iy, iz
@@ -3845,18 +3830,14 @@ contains
         !!  @param iz Index bounds in the z-dir
         !!  @param q_prim_vf Cell-averaged primitive variables
     subroutine s_initialize_riemann_solver( &
-        q_prim_vf, &
-        flux_vf, flux_src_vf, &
-        flux_gsrc_vf, &
-        norm_dir, ix, iy, iz)
+        flux_src_vf, &
+        norm_dir)
 
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         type(scalar_field), &
             dimension(sys_size), &
-            intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
+            intent(inout) :: flux_src_vf
 
         integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
 
         integer :: i, j, k, l ! Generic loop iterators
 
@@ -4133,26 +4114,21 @@ contains
     !! @param[in] ix X-direction loop bounds (int_bounds_info).
     !! @param[in] iy Y-direction loop bounds (int_bounds_info).
     !! @param[in] iz Z-direction loop bounds (int_bounds_info).
-    pure subroutine s_compute_cartesian_viscous_source_flux(velL_vf, &
-                                                            dvelL_dx_vf, &
+    pure subroutine s_compute_cartesian_viscous_source_flux(dvelL_dx_vf, &
                                                             dvelL_dy_vf, &
                                                             dvelL_dz_vf, &
-                                                            velR_vf, &
                                                             dvelR_dx_vf, &
                                                             dvelR_dy_vf, &
                                                             dvelR_dz_vf, &
                                                             flux_src_vf, &
-                                                            norm_dir, &
-                                                            ix, iy, iz)
+                                                            norm_dir)
 
         ! Arguments
-        type(scalar_field), dimension(num_dims), intent(in) :: velL_vf, velR_vf
         type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dx_vf, dvelR_dx_vf
         type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dy_vf, dvelR_dy_vf
         type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dz_vf, dvelR_dz_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: flux_src_vf
         integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
 
         ! Local variables
         real(wp), dimension(num_dims, num_dims) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
@@ -4326,19 +4302,15 @@ contains
         !!  @param flux_src_vf   Intercell source fluxes
         !!  @param flux_gsrc_vf  Intercell geometric source fluxes
         !!  @param norm_dir Dimensional splitting coordinate direction
-        !!  @param ix   Index bounds in  first coordinate direction
-        !!  @param iy   Index bounds in second coordinate direction
-        !!  @param iz   Index bounds in  third coordinate direction
     pure subroutine s_finalize_riemann_solver(flux_vf, flux_src_vf, &
                                               flux_gsrc_vf, &
-                                              norm_dir, ix, iy, iz)
+                                              norm_dir)
 
         type(scalar_field), &
             dimension(sys_size), &
             intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
 
         integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
 
         integer :: i, j, k, l !< Generic loop iterators
 
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 4f51c59898..5dc7ee30a7 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -57,11 +57,11 @@ contains
 
         if (elasticity) then
             call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha, &
-                                                            alpha_rho, Re, j, k, l, G, Gs)
+                                                            alpha_rho, Re, G, Gs)
         elseif (bubbles_euler) then
-            call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv, alpha, alpha_rho, Re, j, k, l)
+            call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv, alpha, alpha_rho, Re)
         else
-            call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha, alpha_rho, Re, j, k, l)
+            call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha, alpha_rho, Re)
         end if
 
         $:LOOP()
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 208f0524e6..bd4ea50666 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1240,15 +1240,9 @@ contains
 
     end subroutine s_initialize_internal_energy_equations
 
-    impure subroutine s_perform_time_step(t_step, time_avg, time_final, io_time_avg, io_time_final, proc_time, io_proc_time, file_exists, start, finish, nt)
+    impure subroutine s_perform_time_step(t_step, time_avg)
         integer, intent(inout) :: t_step
-        real(wp), intent(inout) :: time_avg, time_final
-        real(wp), intent(inout) :: io_time_avg, io_time_final
-        real(wp), dimension(:), intent(inout) :: proc_time
-        real(wp), dimension(:), intent(inout) :: io_proc_time
-        logical, intent(inout) :: file_exists
-        real(wp), intent(inout) :: start, finish
-        integer, intent(inout) :: nt
+        real(wp), intent(inout) :: time_avg
 
 
         integer :: i
@@ -1330,16 +1324,13 @@ contains
 
     end subroutine s_perform_time_step
 
-    impure subroutine s_save_performance_metrics(t_step, time_avg, time_final, io_time_avg, io_time_final, proc_time, io_proc_time, file_exists, start, finish, nt)
+    impure subroutine s_save_performance_metrics(time_avg, time_final, io_time_avg, io_time_final, proc_time, io_proc_time, file_exists)
 
-        integer, intent(inout) :: t_step
         real(wp), intent(inout) :: time_avg, time_final
         real(wp), intent(inout) :: io_time_avg, io_time_final
         real(wp), dimension(:), intent(inout) :: proc_time
         real(wp), dimension(:), intent(inout) :: io_proc_time
         logical, intent(inout) :: file_exists
-        real(wp), intent(inout) :: start, finish
-        integer, intent(inout) :: nt
 
         real(wp) :: grind_time
 
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 97c987327d..6631933e2b 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -66,12 +66,11 @@ contains
         end if
     end subroutine s_initialize_surface_tension_module
 
-    pure subroutine s_compute_capilary_source_flux(q_prim_vf, &
-                                                   vSrc_rsx_vf, vSrc_rsy_vf, vSrc_rsz_vf, &
-                                                   flux_src_vf, &
-                                                   id, isx, isy, isz)
+    pure subroutine s_compute_capilary_source_flux( &
+        vSrc_rsx_vf, vSrc_rsy_vf, vSrc_rsz_vf, &
+        flux_src_vf, &
+        id, isx, isy, isz)
 
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         real(wp), dimension(-1:, 0:, 0:, 1:), intent(in) :: vSrc_rsx_vf
         real(wp), dimension(-1:, 0:, 0:, 1:), intent(in) :: vSrc_rsy_vf
         real(wp), dimension(-1:, 0:, 0:, 1:), intent(in) :: vSrc_rsz_vf
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 15fd95e52c..042b878c04 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -903,13 +903,13 @@ contains
         call nvtxStartRange("TIMESTEP")
 
         ! Stage 1 of 3
-        call s_adaptive_dt_bubble(t_step, 1)
+        call s_adaptive_dt_bubble(1)
 
         ! Stage 2 of 3
         call s_3rd_order_tvd_rk(t_step, time_avg)
 
         ! Stage 3 of 3
-        call s_adaptive_dt_bubble(t_step, 3)
+        call s_adaptive_dt_bubble(3)
 
         call nvtxEndRange
 
@@ -921,9 +921,9 @@ contains
 
     !> Bubble source part in Strang operator splitting scheme
         !! @param t_step Current time-step
-    impure subroutine s_adaptive_dt_bubble(t_step, stage)
+    impure subroutine s_adaptive_dt_bubble(stage)
 
-        integer, intent(in) :: t_step, stage
+        integer, intent(in) :: stage
 
         type(vector_field) :: gm_alpha_qp
 
@@ -931,18 +931,17 @@ contains
             q_cons_ts(1)%vf, &
             q_T_sf, &
             q_prim_vf, &
-            idwint, &
-            gm_alpha_qp%vf)
+            idwint)
 
         if (bubbles_euler) then
 
-            call s_compute_bubble_EE_source(q_cons_ts(1)%vf, q_prim_vf, t_step, rhs_vf)
+            call s_compute_bubble_EE_source(q_cons_ts(1)%vf, q_prim_vf, rhs_vf)
             call s_comp_alpha_from_n(q_cons_ts(1)%vf)
 
         elseif (bubbles_lagrange) then
 
             call s_populate_variables_buffers(q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf, bc_type)
-            call s_compute_bubble_EL_dynamics(q_cons_ts(1)%vf, q_prim_vf, t_step, rhs_vf, stage)
+            call s_compute_bubble_EL_dynamics(q_prim_vf, stage)
             call s_transfer_data_to_tmp()
             call s_smear_voidfraction()
             if (stage == 3) then
@@ -979,8 +978,7 @@ contains
             q_cons_ts(1)%vf, &
             q_T_sf, &
             q_prim_vf, &
-            idwint, &
-            gm_alpha_qp%vf)
+            idwint)
 
         $:PARALLEL_LOOP(collapse=3, private=["vel", "alpha", "Re"])
         do l = 0, p
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 07815a9c46..2cc8797e27 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -1006,18 +1006,18 @@ contains
             if (p > 0) then
                 call s_weno(v_vf(iv%beg:iv%end), &
                             vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, iv%beg:iv%end), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, iv%beg:iv%end), &
-                            norm_dir, weno_dir, &
+                            weno_dir, &
                             is1_viscous, is2_viscous, is3_viscous)
             else
                 call s_weno(v_vf(iv%beg:iv%end), &
                             vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, :), &
-                            norm_dir, weno_dir, &
+                            weno_dir, &
                             is1_viscous, is2_viscous, is3_viscous)
             end if
         else
             call s_weno(v_vf(iv%beg:iv%end), &
                         vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, :), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, :), vR_z(:, :, :, :), &
-                        norm_dir, weno_dir, &
+                        weno_dir, &
                         is1_viscous, is2_viscous, is3_viscous)
         end if
 
@@ -1104,19 +1104,19 @@ contains
 
                 call s_weno(v_vf(iv%beg:iv%end), &
                             vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, iv%beg:iv%end), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, iv%beg:iv%end), &
-                            norm_dir, weno_dir, &
+                            weno_dir, &
                             is1_viscous, is2_viscous, is3_viscous)
             else
                 call s_weno(v_vf(iv%beg:iv%end), &
                             vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, :), &
-                            norm_dir, weno_dir, &
+                            weno_dir, &
                             is1_viscous, is2_viscous, is3_viscous)
             end if
         else
 
             call s_weno(v_vf(iv%beg:iv%end), &
                         vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, :), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, :), vR_z(:, :, :, :), &
-                        norm_dir, weno_dir, &
+                        weno_dir, &
                         is1_viscous, is2_viscous, is3_viscous)
         end if
 
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 1db77c8b45..17e0583687 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -639,13 +639,12 @@ contains
     end subroutine s_compute_weno_coefficients
 
     subroutine s_weno(v_vf, vL_rs_vf_x, vL_rs_vf_y, vL_rs_vf_z, vR_rs_vf_x, vR_rs_vf_y, vR_rs_vf_z, &
-                      norm_dir, weno_dir, &
+                      weno_dir, &
                       is1_weno_d, is2_weno_d, is3_weno_d)
 
         type(scalar_field), dimension(1:), intent(in) :: v_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: vL_rs_vf_x, vL_rs_vf_y, vL_rs_vf_z
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: vR_rs_vf_x, vR_rs_vf_y, vR_rs_vf_z
-        integer, intent(in) :: norm_dir
         integer, intent(in) :: weno_dir
         type(int_bounds_info), intent(in) :: is1_weno_d, is2_weno_d, is3_weno_d
 
@@ -668,7 +667,7 @@ contains
 
         if (weno_order /= 1) then
             call s_initialize_weno(v_vf, &
-                                   norm_dir, weno_dir)
+                                   weno_dir)
         end if
 
         if (weno_order == 1) then
@@ -1120,11 +1119,10 @@ contains
         !! @param is2_weno Index bounds in second coordinate direction
         !! @param is3_weno Index bounds in third coordinate direction
     subroutine s_initialize_weno(v_vf, &
-                                 norm_dir, weno_dir)
+                                 weno_dir)
 
         type(scalar_field), dimension(:), intent(IN) :: v_vf
 
-        integer, intent(IN) :: norm_dir
         integer, intent(IN) :: weno_dir
 
         integer :: j, k, l, q
diff --git a/src/simulation/p_main.fpp b/src/simulation/p_main.fpp
index 57dcbfbdf6..29cb3b8281 100644
--- a/src/simulation/p_main.fpp
+++ b/src/simulation/p_main.fpp
@@ -76,20 +76,19 @@ program p_main
 
         if (cfl_dt) then
             if (mytime >= t_stop) then
-                call s_save_performance_metrics(t_step, time_avg, time_final, io_time_avg, &
-                                                io_time_final, proc_time, io_proc_time, file_exists, start, finish, nt)
+                call s_save_performance_metrics(time_avg, time_final, io_time_avg, &
+                                                io_time_final, proc_time, io_proc_time, file_exists)
                 exit
             end if
         else
             if (t_step == t_step_stop) then
-                call s_save_performance_metrics(t_step, time_avg, time_final, io_time_avg, &
-                                                io_time_final, proc_time, io_proc_time, file_exists, start, finish, nt)
+                call s_save_performance_metrics(time_avg, time_final, io_time_avg, &
+                                                io_time_final, proc_time, io_proc_time, file_exists)
                 exit
             end if
         end if
 
-        call s_perform_time_step(t_step, time_avg, time_final, io_time_avg, io_time_final, &
-                                 proc_time, io_proc_time, file_exists, start, finish, nt)
+        call s_perform_time_step(t_step, time_avg)
 
         if (cfl_dt) then
             if (abs(mod(mytime, t_save)) < dt .or. mytime >= t_stop) then

From 5b8d97905a2fb8900f297341ea1ea67968755571 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 16:14:18 -0400
Subject: [PATCH 30/75] move pressure relaxation to its own module + refac
 (#865)

---
 misc/length-subroutines.sh               |  29 +++
 src/simulation/m_pressure_relaxation.fpp | 315 +++++++++++++++++++++++
 src/simulation/m_rhs.fpp                 | 276 +-------------------
 src/simulation/m_time_steppers.fpp       |   2 +
 4 files changed, 351 insertions(+), 271 deletions(-)
 create mode 100755 misc/length-subroutines.sh
 create mode 100644 src/simulation/m_pressure_relaxation.fpp

diff --git a/misc/length-subroutines.sh b/misc/length-subroutines.sh
new file mode 100755
index 0000000000..b955e2d73a
--- /dev/null
+++ b/misc/length-subroutines.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Use gawk if available, otherwise fall back to awk
+if command -v gawk > /dev/null; then
+    AWK_CMD="gawk"
+    IGNORECASE_BLOCK='BEGIN { IGNORECASE = 1 }'
+else
+    AWK_CMD="awk"
+    IGNORECASE_BLOCK=''
+    echo "Warning: gawk not found. Case-insensitive matching may not work as expected." >&2
+fi
+
+find . -type f \( -name "*.f90" -o -name "*.fpp" \) | while read file; do
+    "$AWK_CMD" "
+    $IGNORECASE_BLOCK
+    /^[ \t]*((pure|elemental|impure)[ \t]+)*subroutine[ \t]+[a-zA-Z_][a-zA-Z0-9_]*[ \t]*\\(/ {
+        in_sub = 1
+        match(\$0, /subroutine[ \t]+([a-zA-Z_][a-zA-Z0-9_]*)/, arr)
+        sub_name = arr[1]
+        start_line = NR
+        next
+    }
+    /^[ \t]*end[ \t]+subroutine[ \t]*([a-zA-Z_][a-zA-Z0-9_]*)?[ \t]*\$/ && in_sub {
+        end_line = NR
+        print (end_line - start_line + 1) \"\t\" FILENAME \": \" sub_name
+        in_sub = 0
+    }
+    " "$file"
+done | sort -nr | awk -F'\t' '{print $2 " : " $1 " lines"}' | head -20
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
new file mode 100644
index 0000000000..8aa78ad88a
--- /dev/null
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -0,0 +1,315 @@
+!>
+!! @file m_pressure_relaxation.fpp
+!! @brief Contains module m_pressure_relaxation
+
+#:include 'case.fpp'
+#:include 'macros.fpp'
+
+!> @brief The module contains the subroutines used to perform pressure relaxation
+!!        for multi-component flows using the 6-equation model. This includes
+!!        volume fraction correction, Newton-Raphson pressure equilibration, and
+!!        internal energy correction to maintain thermodynamic consistency.
+module m_pressure_relaxation
+
+    use m_derived_types        !< Definitions of the derived types
+    use m_global_parameters    !< Definitions of the global parameters
+
+    implicit none
+
+    private; public :: s_pressure_relaxation_procedure, &
+ s_initialize_pressure_relaxation_module, &
+ s_finalize_pressure_relaxation_module
+
+    real(wp), allocatable, dimension(:) :: gamma_min, pres_inf
+    !$acc declare create(gamma_min, pres_inf)
+
+    real(wp), allocatable, dimension(:, :) :: Res
+    !$acc declare create(Res)
+
+contains
+
+    !> Initialize the pressure relaxation module
+    impure subroutine s_initialize_pressure_relaxation_module
+
+        integer :: i, j
+
+        @:ALLOCATE(gamma_min(1:num_fluids), pres_inf(1:num_fluids))
+
+        do i = 1, num_fluids
+            gamma_min(i) = 1._wp/fluid_pp(i)%gamma + 1._wp
+            pres_inf(i) = fluid_pp(i)%pi_inf/(1._wp + fluid_pp(i)%gamma)
+        end do
+        !$acc update device(gamma_min, pres_inf)
+
+        if (viscous) then
+            @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
+            do i = 1, 2
+                do j = 1, Re_size(i)
+                    Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
+                end do
+            end do
+            !$acc update device(Res, Re_idx, Re_size)
+        end if
+
+    end subroutine s_initialize_pressure_relaxation_module
+
+    !> Finalize the pressure relaxation module
+    impure subroutine s_finalize_pressure_relaxation_module
+
+        @:DEALLOCATE(gamma_min, pres_inf)
+        if (viscous) then
+            @:DEALLOCATE(Res)
+        end if
+
+    end subroutine s_finalize_pressure_relaxation_module
+
+    !> The main pressure relaxation procedure
+    !! @param q_cons_vf Cell-average conservative variables
+    pure subroutine s_pressure_relaxation_procedure(q_cons_vf)
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer :: j, k, l
+
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do l = 0, p
+            do k = 0, n
+                do j = 0, m
+                    call s_relax_cell_pressure(q_cons_vf, j, k, l)
+                end do
+            end do
+        end do
+
+    end subroutine s_pressure_relaxation_procedure
+
+    !> Process pressure relaxation for a single cell
+    pure subroutine s_relax_cell_pressure(q_cons_vf, j, k, l)
+        !$acc routine seq
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+
+        ! Volume fraction correction
+        if (mpp_lim) call s_correct_volume_fractions(q_cons_vf, j, k, l)
+
+        ! Pressure equilibration
+        if (s_needs_pressure_relaxation(q_cons_vf, j, k, l)) then
+            call s_equilibrate_pressure(q_cons_vf, j, k, l)
+        end if
+
+        ! Internal energy correction
+        call s_correct_internal_energies(q_cons_vf, j, k, l)
+
+    end subroutine s_relax_cell_pressure
+
+    !> Check if pressure relaxation is needed for this cell
+    pure logical function s_needs_pressure_relaxation(q_cons_vf, j, k, l)
+        !$acc routine seq
+
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+        integer :: i
+
+        s_needs_pressure_relaxation = .true.
+        !$acc loop seq
+        do i = 1, num_fluids
+            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > (1._wp - sgm_eps)) then
+                s_needs_pressure_relaxation = .false.
+            end if
+        end do
+
+    end function s_needs_pressure_relaxation
+
+    !> Correct volume fractions to physical bounds
+    pure subroutine s_correct_volume_fractions(q_cons_vf, j, k, l)
+        !$acc routine seq
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+        real(wp) :: sum_alpha
+        integer :: i
+
+        sum_alpha = 0._wp
+        !$acc loop seq
+        do i = 1, num_fluids
+            if ((q_cons_vf(i + contxb - 1)%sf(j, k, l) < 0._wp) .or. &
+                (q_cons_vf(i + advxb - 1)%sf(j, k, l) < 0._wp)) then
+                q_cons_vf(i + contxb - 1)%sf(j, k, l) = 0._wp
+                q_cons_vf(i + advxb - 1)%sf(j, k, l) = 0._wp
+                q_cons_vf(i + intxb - 1)%sf(j, k, l) = 0._wp
+            end if
+            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > 1._wp) &
+                q_cons_vf(i + advxb - 1)%sf(j, k, l) = 1._wp
+            sum_alpha = sum_alpha + q_cons_vf(i + advxb - 1)%sf(j, k, l)
+        end do
+
+        !$acc loop seq
+        do i = 1, num_fluids
+            q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + advxb - 1)%sf(j, k, l)/sum_alpha
+        end do
+
+    end subroutine s_correct_volume_fractions
+
+    !> Main pressure equilibration using Newton-Raphson
+    pure subroutine s_equilibrate_pressure(q_cons_vf, j, k, l)
+        !$acc routine seq
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+
+        real(wp) :: pres_relax, f_pres, df_pres
+        real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s
+        integer, parameter :: MAX_ITER = 50
+        real(wp), parameter :: TOLERANCE = 1e-10_wp
+        integer :: iter, i
+
+        ! Initialize pressures
+        pres_relax = 0._wp
+        !$acc loop seq
+        do i = 1, num_fluids
+            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
+                pres_K_init(i) = (q_cons_vf(i + intxb - 1)%sf(j, k, l)/ &
+                                  q_cons_vf(i + advxb - 1)%sf(j, k, l) - pi_infs(i))/gammas(i)
+                if (pres_K_init(i) <= -(1._wp - 1e-8_wp)*pres_inf(i) + 1e-8_wp) &
+                    pres_K_init(i) = -(1._wp - 1e-8_wp)*pres_inf(i) + 1e-8_wp
+            else
+                pres_K_init(i) = 0._wp
+            end if
+            pres_relax = pres_relax + q_cons_vf(i + advxb - 1)%sf(j, k, l)*pres_K_init(i)
+        end do
+
+        ! Newton-Raphson iteration
+        f_pres = 1e-9_wp
+        df_pres = 1e9_wp
+        !$acc loop seq
+        do iter = 0, MAX_ITER - 1
+            if (abs(f_pres) > TOLERANCE) then
+                pres_relax = pres_relax - f_pres/df_pres
+
+                ! Enforce pressure bounds
+                do i = 1, num_fluids
+                    if (pres_relax <= -(1._wp - 1e-8_wp)*pres_inf(i) + 1e-8_wp) &
+                        pres_relax = -(1._wp - 1e-8_wp)*pres_inf(i) + 1._wp
+                end do
+
+                ! Newton-Raphson step
+                f_pres = -1._wp
+                df_pres = 0._wp
+                !$acc loop seq
+                do i = 1, num_fluids
+                    if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
+                        rho_K_s(i) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/ &
+                                     max(q_cons_vf(i + advxb - 1)%sf(j, k, l), sgm_eps) &
+                                     *((pres_relax + pres_inf(i))/(pres_K_init(i) + &
+                                                                   pres_inf(i)))**(1._wp/gamma_min(i))
+                        f_pres = f_pres + q_cons_vf(i + contxb - 1)%sf(j, k, l)/rho_K_s(i)
+                        df_pres = df_pres - q_cons_vf(i + contxb - 1)%sf(j, k, l) &
+                                  /(gamma_min(i)*rho_K_s(i)*(pres_relax + pres_inf(i)))
+                    end if
+                end do
+            end if
+        end do
+
+        ! Update volume fractions
+        !$acc loop seq
+        do i = 1, num_fluids
+            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) &
+                q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/rho_K_s(i)
+        end do
+
+    end subroutine s_equilibrate_pressure
+
+    !> Correct internal energies using equilibrated pressure
+    pure subroutine s_correct_internal_energies(q_cons_vf, j, k, l)
+        !$acc routine seq
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+
+        real(wp), dimension(num_fluids) :: alpha_rho, alpha
+        real(wp) :: rho, dyn_pres, gamma, pi_inf, pres_relax, sum_alpha
+        real(wp), dimension(2) :: Re
+        integer :: i, q
+
+        !$acc loop seq
+        do i = 1, num_fluids
+            alpha_rho(i) = q_cons_vf(i)%sf(j, k, l)
+            alpha(i) = q_cons_vf(E_idx + i)%sf(j, k, l)
+        end do
+
+        ! Compute mixture properties (combined bubble and standard logic)
+        rho = 0._wp
+        gamma = 0._wp
+        pi_inf = 0._wp
+
+        if (bubbles_euler) then
+            if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
+                !$acc loop seq
+                do i = 1, num_fluids
+                    rho = rho + alpha_rho(i)
+                    gamma = gamma + alpha(i)*gammas(i)
+                    pi_inf = pi_inf + alpha(i)*pi_infs(i)
+                end do
+            else if ((model_eqns == 2) .and. (num_fluids > 2)) then
+                !$acc loop seq
+                do i = 1, num_fluids - 1
+                    rho = rho + alpha_rho(i)
+                    gamma = gamma + alpha(i)*gammas(i)
+                    pi_inf = pi_inf + alpha(i)*pi_infs(i)
+                end do
+            else
+                rho = alpha_rho(1)
+                gamma = gammas(1)
+                pi_inf = pi_infs(1)
+            end if
+        else
+            sum_alpha = 0._wp
+            if (mpp_lim) then
+                !$acc loop seq
+                do i = 1, num_fluids
+                    alpha_rho(i) = max(0._wp, alpha_rho(i))
+                    alpha(i) = min(max(0._wp, alpha(i)), 1._wp)
+                    sum_alpha = sum_alpha + alpha(i)
+                end do
+                alpha = alpha/max(sum_alpha, sgm_eps)
+            end if
+
+            !$acc loop seq
+            do i = 1, num_fluids
+                rho = rho + alpha_rho(i)
+                gamma = gamma + alpha(i)*gammas(i)
+                pi_inf = pi_inf + alpha(i)*pi_infs(i)
+            end do
+
+            if (viscous) then
+                !$acc loop seq
+                do i = 1, 2
+                    Re(i) = dflt_real
+                    if (Re_size(i) > 0) Re(i) = 0._wp
+                    !$acc loop seq
+                    do q = 1, Re_size(i)
+                        Re(i) = alpha(Re_idx(i, q))/Res(i, q) + Re(i)
+                    end do
+                    Re(i) = 1._wp/max(Re(i), sgm_eps)
+                end do
+            end if
+        end if
+
+        ! Compute dynamic pressure and update internal energies
+        dyn_pres = 0._wp
+        !$acc loop seq
+        do i = momxb, momxe
+            dyn_pres = dyn_pres + 5e-1_wp*q_cons_vf(i)%sf(j, k, l)* &
+                       q_cons_vf(i)%sf(j, k, l)/max(rho, sgm_eps)
+        end do
+
+        pres_relax = (q_cons_vf(E_idx)%sf(j, k, l) - dyn_pres - pi_inf)/gamma
+
+        !$acc loop seq
+        do i = 1, num_fluids
+            q_cons_vf(i + intxb - 1)%sf(j, k, l) = &
+                q_cons_vf(i + advxb - 1)%sf(j, k, l)*(gammas(i)*pres_relax + pi_infs(i))
+        end do
+
+    end subroutine s_correct_internal_energies
+
+end module m_pressure_relaxation
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 846f04370a..e3023de18c 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -64,11 +64,12 @@ module m_rhs
 
     use m_mhd
 
+    use m_pressure_relaxation
+
     implicit none
 
     private; public :: s_initialize_rhs_module, &
  s_compute_rhs, &
- s_pressure_relaxation_procedure, &
  s_finalize_rhs_module
 
     !! This variable contains the WENO-reconstructed values of the cell-average
@@ -159,12 +160,6 @@ module m_rhs
     $:DECLARE(create=["qL_rsx_vf","qL_rsy_vf","qL_rsz_vf","qR_rsx_vf","qR_rsy_vf","qR_rsz_vf"])
     $:DECLARE(create=["dqL_rsx_vf","dqL_rsy_vf","dqL_rsz_vf","dqR_rsx_vf","dqR_rsy_vf","dqR_rsz_vf"])
 
-    real(wp), allocatable, dimension(:) :: gamma_min, pres_inf
-    $:DECLARE(create=["gamma_min","pres_inf"])
-
-    real(wp), allocatable, dimension(:, :) :: Res
-    $:DECLARE(create=["Res"])
-
     real(wp), allocatable, dimension(:, :, :) :: nbub !< Bubble number density
     $:DECLARE(create=["nbub"])
 
@@ -572,26 +567,7 @@ contains
             @:ALLOCATE(blkmod1(0:m, 0:n, 0:p), blkmod2(0:m, 0:n, 0:p), alpha1(0:m, 0:n, 0:p), alpha2(0:m, 0:n, 0:p), Kterm(0:m, 0:n, 0:p))
         end if
 
-        @:ALLOCATE(gamma_min(1:num_fluids), pres_inf(1:num_fluids))
-
-        do i = 1, num_fluids
-            gamma_min(i) = 1._wp/fluid_pp(i)%gamma + 1._wp
-            pres_inf(i) = fluid_pp(i)%pi_inf/(1._wp + fluid_pp(i)%gamma)
-        end do
-        $:UPDATE(device=["gamma_min", "pres_inf"])
-
-        if (viscous) then
-            @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
-        end if
-
-        if (viscous) then
-            do i = 1, 2
-                do j = 1, Re_size(i)
-                    Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
-                end do
-            end do
-            $:UPDATE(device=["Res","Re_idx","Re_size"])
-        end if
+        call s_initialize_pressure_relaxation_module
 
         $:PARALLEL_LOOP(collapse=4)
         do id = 1, num_dims
@@ -1661,250 +1637,6 @@ contains
         !!      fraction of each phase are recomputed. For conservation
         !!      purpose, this pressure is finally corrected using the
         !!      mixture-total-energy equation.
-        !!  @param q_cons_vf Cell-average conservative variables
-    pure subroutine s_pressure_relaxation_procedure(q_cons_vf)
-
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-
-        !> @name Relaxed pressure, initial partial pressures, function f(p) and its partial
-            !! derivative df(p), isentropic partial density, sum of volume fractions,
-            !! mixture density, dynamic pressure, surface energy, specific heat ratio
-            !! function, liquid stiffness function (two variations of the last two
-            !! ones), shear and volume Reynolds numbers and the Weber numbers
-        !> @{
-        real(wp) :: pres_relax
-        real(wp), dimension(num_fluids) :: pres_K_init
-        real(wp) :: f_pres
-        real(wp) :: df_pres
-        real(wp), dimension(num_fluids) :: rho_K_s
-        real(wp), dimension(num_fluids) :: alpha_rho
-        real(wp), dimension(num_fluids) :: alpha
-        real(wp) :: sum_alpha
-        real(wp) :: rho
-        real(wp) :: dyn_pres
-        real(wp) :: gamma
-        real(wp) :: pi_inf
-        real(wp), dimension(2) :: Re
-
-        integer :: i, j, k, l, q, iter !< Generic loop iterators
-        integer :: relax !< Relaxation procedure determination variable
-
-        $:PARALLEL_LOOP(collapse=3, private=["pres_K_init", "rho_K_s", &
-            "alpha_rho", "alpha", "Re", "pres_relax"])
-        do l = 0, p
-            do k = 0, n
-                do j = 0, m
-
-                    ! Numerical correction of the volume fractions
-                    if (mpp_lim) then
-                        sum_alpha = 0._wp
-
-                        $:LOOP()
-                        do i = 1, num_fluids
-                            if ((q_cons_vf(i + contxb - 1)%sf(j, k, l) < 0._wp) .or. &
-                                (q_cons_vf(i + advxb - 1)%sf(j, k, l) < 0._wp)) then
-                                q_cons_vf(i + contxb - 1)%sf(j, k, l) = 0._wp
-                                q_cons_vf(i + advxb - 1)%sf(j, k, l) = 0._wp
-                                q_cons_vf(i + intxb - 1)%sf(j, k, l) = 0._wp
-                            end if
-
-                            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > 1._wp) &
-                                q_cons_vf(i + advxb - 1)%sf(j, k, l) = 1._wp
-                            sum_alpha = sum_alpha + q_cons_vf(i + advxb - 1)%sf(j, k, l)
-                        end do
-
-                        $:LOOP()
-                        do i = 1, num_fluids
-                            q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + advxb - 1)%sf(j, k, l)/sum_alpha
-                        end do
-                    end if
-
-                    ! Pressures relaxation procedure
-
-                    ! Is the pressure relaxation procedure necessary?
-                    relax = 1
-
-                    $:LOOP()
-                    do i = 1, num_fluids
-                        if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > (1._wp - sgm_eps)) relax = 0
-                    end do
-
-                    if (relax == 1) then
-                        ! Initial state
-                        pres_relax = 0._wp
-
-                        $:LOOP()
-                        do i = 1, num_fluids
-                            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
-                                pres_K_init(i) = &
-                                    (q_cons_vf(i + intxb - 1)%sf(j, k, l)/ &
-                                     q_cons_vf(i + advxb - 1)%sf(j, k, l) &
-                                     - pi_infs(i))/gammas(i)
-
-                                if (pres_K_init(i) <= -(1._wp - 1e-8_wp)*pres_inf(i) + 1e-8_wp) &
-                                    pres_K_init(i) = -(1._wp - 1e-8_wp)*pres_inf(i) + 1e-8_wp
-                            else
-                                pres_K_init(i) = 0._wp
-                            end if
-                            pres_relax = pres_relax + q_cons_vf(i + advxb - 1)%sf(j, k, l)*pres_K_init(i)
-                        end do
-
-                        ! Iterative process for relaxed pressure determination
-                        f_pres = 1e-9_wp
-                        df_pres = 1e9_wp
-
-                        $:LOOP()
-                        do i = 1, num_fluids
-                            rho_K_s(i) = 0._wp
-                        end do
-
-                        $:LOOP()
-                        do iter = 0, 49
-
-                            if (abs(f_pres) > 1e-10_wp) then
-                                pres_relax = pres_relax - f_pres/df_pres
-
-                                ! Physical pressure
-                                do i = 1, num_fluids
-                                    if (pres_relax <= -(1._wp - 1e-8_wp)*pres_inf(i) + 1e-8_wp) &
-                                        pres_relax = -(1._wp - 1e-8_wp)*pres_inf(i) + 1._wp
-                                end do
-
-                                ! Newton-Raphson method
-                                f_pres = -1._wp
-                                df_pres = 0._wp
-
-                                $:LOOP()
-                                do i = 1, num_fluids
-                                    if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
-                                        rho_K_s(i) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/ &
-                                                     max(q_cons_vf(i + advxb - 1)%sf(j, k, l), sgm_eps) &
-                                                     *((pres_relax + pres_inf(i))/(pres_K_init(i) + &
-                                                                                   pres_inf(i)))**(1._wp/gamma_min(i))
-
-                                        f_pres = f_pres + q_cons_vf(i + contxb - 1)%sf(j, k, l) &
-                                                 /rho_K_s(i)
-
-                                        df_pres = df_pres - q_cons_vf(i + contxb - 1)%sf(j, k, l) &
-                                                  /(gamma_min(i)*rho_K_s(i)*(pres_relax + pres_inf(i)))
-                                    end if
-                                end do
-                            end if
-
-                        end do
-
-                        ! Cell update of the volume fraction
-                        $:LOOP()
-                        do i = 1, num_fluids
-                            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) &
-                                q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l) &
-                                                                       /rho_K_s(i)
-                        end do
-                    end if
-
-                    ! Mixture-total-energy correction
-
-                    ! The mixture-total-energy correction of the mixture pressure P is not necessary here
-                    ! because the primitive variables are directly recovered later on by the conservative
-                    ! variables (see s_convert_conservative_to_primitive_variables called in s_compute_rhs).
-                    ! However, the internal-energy equations should be reset with the corresponding mixture
-                    ! pressure from the correction. This step is carried out below.
-
-                    $:LOOP()
-                    do i = 1, num_fluids
-                        alpha_rho(i) = q_cons_vf(i)%sf(j, k, l)
-                        alpha(i) = q_cons_vf(E_idx + i)%sf(j, k, l)
-                    end do
-
-                    if (bubbles_euler) then
-                        rho = 0._wp
-                        gamma = 0._wp
-                        pi_inf = 0._wp
-
-                        if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                            $:LOOP()
-                            do i = 1, num_fluids
-                                rho = rho + alpha_rho(i)
-                                gamma = gamma + alpha(i)*gammas(i)
-                                pi_inf = pi_inf + alpha(i)*pi_infs(i)
-                            end do
-                        else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                            $:LOOP()
-                            do i = 1, num_fluids - 1
-                                rho = rho + alpha_rho(i)
-                                gamma = gamma + alpha(i)*gammas(i)
-                                pi_inf = pi_inf + alpha(i)*pi_infs(i)
-                            end do
-                        else
-                            rho = alpha_rho(1)
-                            gamma = gammas(1)
-                            pi_inf = pi_infs(1)
-                        end if
-                    else
-                        rho = 0._wp
-                        gamma = 0._wp
-                        pi_inf = 0._wp
-
-                        sum_alpha = 0._wp
-
-                        if (mpp_lim) then
-                            $:LOOP()
-                            do i = 1, num_fluids
-                                alpha_rho(i) = max(0._wp, alpha_rho(i))
-                                alpha(i) = min(max(0._wp, alpha(i)), 1._wp)
-                                sum_alpha = sum_alpha + alpha(i)
-                            end do
-
-                            alpha = alpha/max(sum_alpha, sgm_eps)
-
-                        end if
-
-                        $:LOOP()
-                        do i = 1, num_fluids
-                            rho = rho + alpha_rho(i)
-                            gamma = gamma + alpha(i)*gammas(i)
-                            pi_inf = pi_inf + alpha(i)*pi_infs(i)
-                        end do
-
-                        if (viscous) then
-                            $:LOOP()
-                            do i = 1, 2
-                                Re(i) = dflt_real
-
-                                if (Re_size(i) > 0) Re(i) = 0._wp
-                                $:LOOP()
-                                do q = 1, Re_size(i)
-                                    Re(i) = alpha(Re_idx(i, q))/Res(i, q) &
-                                            + Re(i)
-                                end do
-
-                                Re(i) = 1._wp/max(Re(i), sgm_eps)
-
-                            end do
-                        end if
-                    end if
-
-                    dyn_pres = 0._wp
-
-                    $:LOOP()
-                    do i = momxb, momxe
-                        dyn_pres = dyn_pres + 5e-1_wp*q_cons_vf(i)%sf(j, k, l)* &
-                                   q_cons_vf(i)%sf(j, k, l)/max(rho, sgm_eps)
-                    end do
-
-                    pres_relax = (q_cons_vf(E_idx)%sf(j, k, l) - dyn_pres - pi_inf)/gamma
-
-                    $:LOOP()
-                    do i = 1, num_fluids
-                        q_cons_vf(i + intxb - 1)%sf(j, k, l) = &
-                            q_cons_vf(i + advxb - 1)%sf(j, k, l)* &
-                            (gammas(i)*pres_relax + pi_infs(i))
-                    end do
-                end do
-            end do
-        end do
-
-    end subroutine s_pressure_relaxation_procedure
 
     !>  The purpose of this subroutine is to WENO-reconstruct the
         !!      left and the right cell-boundary values, including values
@@ -2048,6 +1780,8 @@ contains
 
         integer :: i, j, l
 
+        call s_finalize_pressure_relaxation_module
+
         do j = cont_idx%beg, cont_idx%end
             if (relativity) then
                 ! Cons and Prim densities are different for relativity
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 042b878c04..b9d2594a05 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -19,6 +19,8 @@ module m_time_steppers
 
     use m_rhs                  !< Right-hane-side (RHS) evaluation procedures
 
+    use m_pressure_relaxation  !< Pressure relaxation procedures
+
     use m_data_output          !< Run-time info & solution data output procedures
 
     use m_bubbles_EE           !< Ensemble-averaged bubble dynamics routines

From 1ed95c0f0984de61af42a66a6231277fd74971c4 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Jun 2025 01:45:01 -0400
Subject: [PATCH 31/75] Create .cursorrules (#866)

---
 .cursorrules | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 .cursorrules

diff --git a/.cursorrules b/.cursorrules
new file mode 100644
index 0000000000..a0451d35c7
--- /dev/null
+++ b/.cursorrules
@@ -0,0 +1,125 @@
+######################################################################
+#  .cursorrules  –  project-level instructions for Cursor AI         #
+#  Target repo: github.com/mflowcode/mfc                             #
+#  Format: plain-text "rules for the Agent". One idea per line.      #
+######################################################################
+
+### ───────────────────────────────────────────────────────────────────
+###  1. Global project context (ALWAYS applied)
+### ───────────────────────────────────────────────────────────────────
+
+- You are interacting with **MFC** – an exascale, many-physics, multi-component
+  flow solver written in modern Fortran and generated with the Fypp
+  pre-processor. Most source files use the `.fpp` extension (Fypp templates)
+  and are transpiled to `.f90` during the CMake build. A minority of modules
+  remain as plain `.f90`. The code lives in `src/`, tests in `tests/`,
+  and example input cases in `examples/`.
+
+- Fypp helper macros live in files under any `include/` directory within
+  `src/` (e.g., `src/<module>/include/`). **Scan these macro files first** so
+  that template expansions resolve correctly.
+
+- Always assume free-form Fortran 2008+ (`.fpp` and `.f90`) with `implicit none`,
+  explicit `intent(in|out|inout)`, and modern constructs (`allocate`,
+  `move_alloc`, `iso_fortran_env`, etc.).
+
+- Prefer `module` + `contains` → `subroutine foo()` patterns; avoid COMMON
+  blocks and `include` files.
+
+- **Simulation kernels are accelerated exclusively with OpenACC.** Always
+  include the appropriate `!$acc` pragmas when generating or modifying
+  performance-critical loops.
+
+- **The codebase is written in Fortran 2008+ with the Fypp pre-processor.**
+  Always use the appropriate Fypp macros when generating or modifying
+  performance-critical loops.
+
+- **Before suggesting or applying any source-code changes, first read/parse the
+  full codebase *and* the documentation sources listed below, then produce a
+  concise description of how the codebase is structured and functions.**
+
+- Documentation sources:
+  - Primary online docs: <https://mflowcode.github.io/documentation/md_readme.html>
+  - Root-level `README.md` in the repository.
+
+### ───────────────────────────────────────────────────────────────────
+###  2. Style & naming conventions  (AUTO-ATTACH for *.fpp and *.f90 files)
+### ───────────────────────────────────────────────────────────────────
+
+- Indent 2 spaces; continuation lines line up under “&”.
+- No maximum line length is enforced.
+- Prefer shorter, streamlined code—avoid needless verbosity while keeping clarity.
+- Lower-case keywords (`do`, `end subroutine`, etc.).
+- Module names: `m_<feature>` (e.g., `m_transport`).
+- Public procedures: `<verb>_<noun>` (e.g., `compute_flux`) with prefixe `s_` or `f_` for subroutines and functions, respectively.
+- Avoid private helper functions within a subroutine or function, and instead make them private to the module (if appropriate)
+- Helper functions or subroutines start with with f_[name here] or s_[name here], respectively
+- Avoid passing arguments to subroutines and functions that need to be reshaped in the subroutine or function, such that the compiler can optimize the code.
+- Avoid common Fortran code smells, such as:
+  - Use of `goto` statements (except in legacy interop situations).
+  - Omission of `intent(...)` declarations.
+  - Use of global state via `COMMON` blocks or `save` variables.
+  - Shadowing of variable names (especially loop indices).
+  - Non-descriptive or single-letter variable names outside small scopes.
+  - Excessively long helper procedures (>200 lines); prefer decomposition.
+  - Excessively long subroutines (>500 lines); prefer decomposition.
+  - Excessively long functions (>100 lines); prefer decomposition.
+  - Excessively long modules (>1000 lines); prefer decomposition.
+  - Excessively long files (>1000 lines); prefer decomposition.
+  - Subroutines and functions that are too long to be understood at a glance.
+  - Use of `!$acc routine seq` to mark subroutines and functions that are called from OpenACC parallel loops.
+  - Mixing I/O logic with computation (separate concerns).
+  - Missing implicit none statements.
+  - Missing `intent(in|out|inout)` declarations.
+  - Missing `dimension` or `allocatable`/`pointer` attributes.
+  - Missing `private` declarations.
+  - Missing `public` declarations.
+  - Missing `module procedure` declarations.
+  - Missing `use` statements.
+  - Declaring arrays without `dimension` or `allocatable`/`pointer` attributes.
+  - Use of `stop` for error handling (prefer use of the subroutine `s_mpi_abort` with an appropriate string message argument).
+
+### ───────────────────────────────────────────────────────────────────
+###  3. Test workflow  (AUTOMATICALLY applied when the code is significantly changed and should be tested)
+### ───────────────────────────────────────────────────────────────────
+
+- Run the test suite after any meaningful code change to verify correctness.
+- Ask the user to alter the test suite command so that only the relevant tests are run.
+- When you think you are done with the code changes, run ./mfc.sh test -j $(nproc) -f EA8FA07E -t 9E2CA336
+- Otherwise, only run specific tests that are relevant to the code changes.
+- Do not run ./mfc.sh test -j $(nproc) without any other arguments (it takes too long to run all tests).
+
+### ───────────────────────────────────────────────────────────────────
+###  4. OpenACC programming guidelines  (AUTOMATICALLY applied to *.fpp/*.f90)
+### ───────────────────────────────────────────────────────────────────
+
+- **Prefer:**
+
+  !$acc parallel loop gang vector default(present)
+
+  around tight loops over cells or particles; fall back to `!$acc kernels`
+  only when loop dependencies prevent direct parallelization. Always add
+  `reduction` clauses where needed.
+
+- Use `collapse(n)` in `!$acc parallel loop` pragmas to combine nested loops
+  when it improves parallelism and the iterations are independent.
+
+- Use `private(var1, var2, ...)` clauses for all variables that are local
+  to a parallel region or loop, especially scalar temporaries and loop counters.
+
+- Use `!$acc routine seq` to mark subroutines and functions that are called from OpenACC parallel loops. 
+
+- When adding `!$acc routine seq` put it on the first line after the declaration of the subroutine or function.
+
+- Allocate large arrays with the `managed` attribute or move them to the device
+  at program start using a persistent `!$acc enter data` region.
+
+- Do **not** place `stop` or `error stop` statements inside OpenACC parallel regions or loops, as they are unsupported in device code and will cause runtime failures.
+
+- Ensure the code compiles with Cray Fortran (`ftn`) and NVIDIA HPC SDK
+  (`nvfortran`) for OpenACC GPU offloading, and with GNU (`gfortran`) and Intel
+  (`ifx`/`ifort`) for CPU-only builds where OpenACC directives are ignored.
+
+######################################################################
+# End of file
+######################################################################

From 43b498a3e2bb5162fd334e7aae723265e5721a7e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Jun 2025 09:03:22 -0400
Subject: [PATCH 32/75] fix cursorrules (#867)

---
 .cursor/rules/mfc-agent-rules.mdc | 87 +++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 .cursor/rules/mfc-agent-rules.mdc

diff --git a/.cursor/rules/mfc-agent-rules.mdc b/.cursor/rules/mfc-agent-rules.mdc
new file mode 100644
index 0000000000..a323c4cdd5
--- /dev/null
+++ b/.cursor/rules/mfc-agent-rules.mdc
@@ -0,0 +1,87 @@
+---
+description: 
+globs: 
+alwaysApply: true
+---
+---
+description: Full MFC project rules – consolidated for Agent Mode
+alwaysApply: true
+---
+
+# 0  Purpose & Scope
+Consolidated guidance for the MFC exascale, many-physics solver.  
+Written primarily for Fortran/Fypp; the OpenACC and style sections matter only when
+`.fpp` / `.f90` files are in view.
+
+---
+
+# 1  Global Project Context (always)
+- **Project**: *MFC* is modern Fortran 2008+ generated with **Fypp**.  
+  - Sources `src/`, tests `tests/`, examples `examples/`.  
+  - Most sources are `.fpp`; CMake transpiles them to `.f90`.  
+- **Fypp macros** live in `src/<subprogram>/include/` you should scan these first.  
+  `<subprogram>` ∈ {`simulation`,`common`,`pre_process`,`post_process`}.  
+- Only `simulation` (+ its `common` calls) is GPU-accelerated via **OpenACC**.  
+- Assume free-form Fortran 2008+, `implicit none`, explicit `intent`, and modern
+  intrinsics.  
+- Prefer `module … contains … subroutine foo()`; avoid `COMMON` blocks and
+  file-level `include` files.  
+- **Read the full codebase and docs *before* changing code.**  
+  Docs: <https://mflowcode.github.io/documentation/md_readme.html> and the respository root `README.md`.  
+
+### Incremental-change workflow
+1. Draft a step-by-step plan.  
+2. After each step, build:  
+   ```bash
+   ./mfc.sh build -t pre_process simulation -j $(nproc)
+    ```
+3. If it compiles, run focused tests:
+   ```bash
+   ./mfc.sh test -j $(nproc) -f EA8FA07E -t 9E2CA336
+   ```
+4. Roll back & fix if a step fails.
+
+* Do not run ./mfc.sh test -j $(nproc) without any other arguments (it takes too long to run all tests).
+
+---
+
+# 2  Style & Naming Conventions (for \*.fpp / \*.f90)
+
+* **Indent 2 spaces**; continuation lines align under `&`.
+* Lower-case keywords and intrinsics (`do`, `end subroutine`, …).
+* **Modules**: `m_<feature>` (e.g. `m_transport`).
+* **Public procedures**:
+  * Subroutine → `s_<verb>_<noun>` (e.g. `s_compute_flux`)
+  * Function   → `f_<verb>_<noun>`
+* Private helpers stay in the module; avoid nested procedures.
+* **Size limits**: subroutine ≤ 500 lines, helper ≤ 150, function ≤ 100,
+  module/file ≤ 1000.
+* ≤ 6 arguments per routine; otherwise pass a derived-type “params” struct.
+* No `goto` (except unavoidable legacy); no global state (`COMMON`, `save`).
+* Every variable: `intent(in|out|inout)` + appropriate `dimension` / `allocatable`
+  / `pointer`.
+* Use `s_mpi_abort(<msg>)` for errors, not `stop`.
+* Mark OpenACC-callable helpers that are called from OpenACC parallel loops immediately after declaration:
+  ```fortran
+  subroutine s_flux_update(...)
+    !$acc routine seq
+    ...
+  end subroutine
+  ```
+
+---
+
+# 3  OpenACC Programming Guidelines (for kernels)
+
+Wrap tight loops with
+
+```fortran
+!$acc parallel loop gang vector default(present) reduction(...)
+```
+* Add `collapse(n)` to merge nested loops when safe.
+* Declare loop-local variables with `private(...)`.
+* Allocate large arrays with `managed` or move them into a persistent
+  `!$acc enter data` region at start-up.
+* **Do not** place `stop` / `error stop` inside device code.
+* Must compile with Cray `ftn` and NVIDIA `nvfortran` for GPU offloading; also build CPU-only with
+  GNU `gfortran` and Intel `ifx`/`ifort`.

From 5d87f90e2f9a0d0d48e0532afdb45fad431c43fb Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Jun 2025 09:04:05 -0400
Subject: [PATCH 33/75] Delete .cursorrules (#868)

---
 .cursorrules | 125 ---------------------------------------------------
 1 file changed, 125 deletions(-)
 delete mode 100644 .cursorrules

diff --git a/.cursorrules b/.cursorrules
deleted file mode 100644
index a0451d35c7..0000000000
--- a/.cursorrules
+++ /dev/null
@@ -1,125 +0,0 @@
-######################################################################
-#  .cursorrules  –  project-level instructions for Cursor AI         #
-#  Target repo: github.com/mflowcode/mfc                             #
-#  Format: plain-text "rules for the Agent". One idea per line.      #
-######################################################################
-
-### ───────────────────────────────────────────────────────────────────
-###  1. Global project context (ALWAYS applied)
-### ───────────────────────────────────────────────────────────────────
-
-- You are interacting with **MFC** – an exascale, many-physics, multi-component
-  flow solver written in modern Fortran and generated with the Fypp
-  pre-processor. Most source files use the `.fpp` extension (Fypp templates)
-  and are transpiled to `.f90` during the CMake build. A minority of modules
-  remain as plain `.f90`. The code lives in `src/`, tests in `tests/`,
-  and example input cases in `examples/`.
-
-- Fypp helper macros live in files under any `include/` directory within
-  `src/` (e.g., `src/<module>/include/`). **Scan these macro files first** so
-  that template expansions resolve correctly.
-
-- Always assume free-form Fortran 2008+ (`.fpp` and `.f90`) with `implicit none`,
-  explicit `intent(in|out|inout)`, and modern constructs (`allocate`,
-  `move_alloc`, `iso_fortran_env`, etc.).
-
-- Prefer `module` + `contains` → `subroutine foo()` patterns; avoid COMMON
-  blocks and `include` files.
-
-- **Simulation kernels are accelerated exclusively with OpenACC.** Always
-  include the appropriate `!$acc` pragmas when generating or modifying
-  performance-critical loops.
-
-- **The codebase is written in Fortran 2008+ with the Fypp pre-processor.**
-  Always use the appropriate Fypp macros when generating or modifying
-  performance-critical loops.
-
-- **Before suggesting or applying any source-code changes, first read/parse the
-  full codebase *and* the documentation sources listed below, then produce a
-  concise description of how the codebase is structured and functions.**
-
-- Documentation sources:
-  - Primary online docs: <https://mflowcode.github.io/documentation/md_readme.html>
-  - Root-level `README.md` in the repository.
-
-### ───────────────────────────────────────────────────────────────────
-###  2. Style & naming conventions  (AUTO-ATTACH for *.fpp and *.f90 files)
-### ───────────────────────────────────────────────────────────────────
-
-- Indent 2 spaces; continuation lines line up under “&”.
-- No maximum line length is enforced.
-- Prefer shorter, streamlined code—avoid needless verbosity while keeping clarity.
-- Lower-case keywords (`do`, `end subroutine`, etc.).
-- Module names: `m_<feature>` (e.g., `m_transport`).
-- Public procedures: `<verb>_<noun>` (e.g., `compute_flux`) with prefixe `s_` or `f_` for subroutines and functions, respectively.
-- Avoid private helper functions within a subroutine or function, and instead make them private to the module (if appropriate)
-- Helper functions or subroutines start with with f_[name here] or s_[name here], respectively
-- Avoid passing arguments to subroutines and functions that need to be reshaped in the subroutine or function, such that the compiler can optimize the code.
-- Avoid common Fortran code smells, such as:
-  - Use of `goto` statements (except in legacy interop situations).
-  - Omission of `intent(...)` declarations.
-  - Use of global state via `COMMON` blocks or `save` variables.
-  - Shadowing of variable names (especially loop indices).
-  - Non-descriptive or single-letter variable names outside small scopes.
-  - Excessively long helper procedures (>200 lines); prefer decomposition.
-  - Excessively long subroutines (>500 lines); prefer decomposition.
-  - Excessively long functions (>100 lines); prefer decomposition.
-  - Excessively long modules (>1000 lines); prefer decomposition.
-  - Excessively long files (>1000 lines); prefer decomposition.
-  - Subroutines and functions that are too long to be understood at a glance.
-  - Use of `!$acc routine seq` to mark subroutines and functions that are called from OpenACC parallel loops.
-  - Mixing I/O logic with computation (separate concerns).
-  - Missing implicit none statements.
-  - Missing `intent(in|out|inout)` declarations.
-  - Missing `dimension` or `allocatable`/`pointer` attributes.
-  - Missing `private` declarations.
-  - Missing `public` declarations.
-  - Missing `module procedure` declarations.
-  - Missing `use` statements.
-  - Declaring arrays without `dimension` or `allocatable`/`pointer` attributes.
-  - Use of `stop` for error handling (prefer use of the subroutine `s_mpi_abort` with an appropriate string message argument).
-
-### ───────────────────────────────────────────────────────────────────
-###  3. Test workflow  (AUTOMATICALLY applied when the code is significantly changed and should be tested)
-### ───────────────────────────────────────────────────────────────────
-
-- Run the test suite after any meaningful code change to verify correctness.
-- Ask the user to alter the test suite command so that only the relevant tests are run.
-- When you think you are done with the code changes, run ./mfc.sh test -j $(nproc) -f EA8FA07E -t 9E2CA336
-- Otherwise, only run specific tests that are relevant to the code changes.
-- Do not run ./mfc.sh test -j $(nproc) without any other arguments (it takes too long to run all tests).
-
-### ───────────────────────────────────────────────────────────────────
-###  4. OpenACC programming guidelines  (AUTOMATICALLY applied to *.fpp/*.f90)
-### ───────────────────────────────────────────────────────────────────
-
-- **Prefer:**
-
-  !$acc parallel loop gang vector default(present)
-
-  around tight loops over cells or particles; fall back to `!$acc kernels`
-  only when loop dependencies prevent direct parallelization. Always add
-  `reduction` clauses where needed.
-
-- Use `collapse(n)` in `!$acc parallel loop` pragmas to combine nested loops
-  when it improves parallelism and the iterations are independent.
-
-- Use `private(var1, var2, ...)` clauses for all variables that are local
-  to a parallel region or loop, especially scalar temporaries and loop counters.
-
-- Use `!$acc routine seq` to mark subroutines and functions that are called from OpenACC parallel loops. 
-
-- When adding `!$acc routine seq` put it on the first line after the declaration of the subroutine or function.
-
-- Allocate large arrays with the `managed` attribute or move them to the device
-  at program start using a persistent `!$acc enter data` region.
-
-- Do **not** place `stop` or `error stop` statements inside OpenACC parallel regions or loops, as they are unsupported in device code and will cause runtime failures.
-
-- Ensure the code compiles with Cray Fortran (`ftn`) and NVIDIA HPC SDK
-  (`nvfortran`) for OpenACC GPU offloading, and with GNU (`gfortran`) and Intel
-  (`ifx`/`ifort`) for CPU-only builds where OpenACC directives are ignored.
-
-######################################################################
-# End of file
-######################################################################

From 28d97009d7c9cb0a4334eb37e98628fabb7274e1 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Jun 2025 09:59:50 -0400
Subject: [PATCH 34/75] Update mfc-agent-rules.mdc (#869)

---
 .cursor/rules/mfc-agent-rules.mdc | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.cursor/rules/mfc-agent-rules.mdc b/.cursor/rules/mfc-agent-rules.mdc
index a323c4cdd5..02fe5d7e0e 100644
--- a/.cursor/rules/mfc-agent-rules.mdc
+++ b/.cursor/rules/mfc-agent-rules.mdc
@@ -1,9 +1,4 @@
 ---
-description: 
-globs: 
-alwaysApply: true
----
----
 description: Full MFC project rules – consolidated for Agent Mode
 alwaysApply: true
 ---

From 46d55d81e29b0de49ac6d717e7ea12699298131f Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Jun 2025 23:23:15 -0400
Subject: [PATCH 35/75] Make CBC not so terrible (#870)

---
 src/simulation/m_compute_cbc.fpp | 300 ++++++++++++-------------------
 1 file changed, 112 insertions(+), 188 deletions(-)

diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index 421f4ab07a..5880361461 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -1,12 +1,10 @@
 #:include 'directive_macros.fpp'
 !>
 !! @file m_compute_cbc.f90
-!! @brief Contains module m_compute_cbc
+!! @brief CBC computation module
 
 module m_compute_cbc
-
-    use m_global_parameters    !< Definitions of the global parameters
-
+    use m_global_parameters
     implicit none
 
     private; public :: s_compute_slip_wall_L, &
@@ -19,11 +17,72 @@ module m_compute_cbc
  s_compute_supersonic_outflow_L
 
 contains
+    !> Base L1 calculation
+    pure function f_base_L1(lambda, rho, c, dpres_ds, dvel_ds) result(L1)
+        !$acc routine seq
+        real(wp), dimension(3), intent(in) :: lambda
+        real(wp), intent(in) :: rho, c, dpres_ds
+        real(wp), dimension(num_dims), intent(in) :: dvel_ds
+        real(wp) :: L1
+        L1 = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
+    end function f_base_L1
+
+    !> Fill density L variables
+    pure subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds)
+        !$acc routine seq
+        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), intent(in) :: lambda_factor, lambda2, c
+        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+        real(wp), intent(in) :: dpres_ds
+        integer :: i
+
+        do i = 2, momxb
+            L(i) = lambda_factor*lambda2*(c*c*dalpha_rho_ds(i - 1) - mf(i - 1)*dpres_ds)
+        end do
+    end subroutine s_fill_density_L
+
+    !> Fill velocity L variables
+    pure subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds)
+        !$acc routine seq
+        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), intent(in) :: lambda_factor, lambda2
+        real(wp), dimension(num_dims), intent(in) :: dvel_ds
+        integer :: i
+
+        do i = momxb + 1, momxe
+            L(i) = lambda_factor*lambda2*dvel_ds(dir_idx(i - contxe))
+        end do
+    end subroutine s_fill_velocity_L
+
+    !> Fill advection L variables
+    pure subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds)
+        !$acc routine seq
+        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), intent(in) :: lambda_factor, lambda2
+        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+        integer :: i
+
+        do i = E_idx, advxe - 1
+            L(i) = lambda_factor*lambda2*dadv_ds(i - momxe)
+        end do
+    end subroutine s_fill_advection_L
+
+    !> Fill chemistry L variables
+    pure subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds)
+        !$acc routine seq
+        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), intent(in) :: lambda_factor, lambda2
+        real(wp), dimension(num_species), intent(in) :: dYs_ds
+        integer :: i
+
+        if (.not. chemistry) return
+
+        do i = chemxb, chemxe
+            L(i) = lambda_factor*lambda2*dYs_ds(i - chemxb + 1)
+        end do
+    end subroutine s_fill_chemistry_L
 
-    !>  The L variables for the slip wall CBC, see pg. 451 of
-        !!      Thompson (1990). At the slip wall (frictionless wall),
-        !!      the normal component of velocity is zero at all times,
-        !!      while the transverse velocities may be nonzero.
+    !> Slip wall CBC (Thompson 1990, pg. 451)
     pure subroutine s_compute_slip_wall_L(lambda, L, rho, c, dpres_ds, dvel_ds)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_slip_wall_L
@@ -32,26 +91,16 @@ contains
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: rho, c
-        real(wp), intent(in) :: dpres_ds
+        real(wp), intent(in) :: rho, c, dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
-
         integer :: i
 
-        L(1) = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
-
-        do i = 2, advxe
-            L(i) = 0._wp
-        end do
-
+        L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
+        L(2:advxe - 1) = 0._wp
         L(advxe) = L(1)
-
     end subroutine s_compute_slip_wall_L
 
-    !>  The L variables for the nonreflecting subsonic buffer CBC
-        !!      see pg. 13 of Thompson (1987). The nonreflecting subsonic
-        !!      buffer reduces the amplitude of any reflections caused by
-        !!      outgoing waves.
+    !> Nonreflecting subsonic buffer CBC (Thompson 1987, pg. 13)
     pure subroutine s_compute_nonreflecting_subsonic_buffer_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_buffer_L
@@ -66,42 +115,22 @@ contains
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
         real(wp), dimension(num_species), intent(in) :: dYs_ds
+        real(wp) :: lambda_factor
 
-        integer :: i !< Generic loop iterator
+        lambda_factor = (5e-1_wp - 5e-1_wp*sign(1._wp, lambda(1)))
+        L(1) = lambda_factor*lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
 
-        L(1) = (5e-1_wp - 5e-1_wp*sign(1._wp, lambda(1)))*lambda(1) &
-               *(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
-
-        do i = 2, momxb
-            L(i) = (5e-1_wp - 5e-1_wp*sign(1._wp, lambda(2)))*lambda(2) &
-                   *(c*c*dalpha_rho_ds(i - 1) - mf(i - 1)*dpres_ds)
-        end do
-
-        do i = momxb + 1, momxe
-            L(i) = (5e-1_wp - 5e-1_wp*sign(1._wp, lambda(2)))*lambda(2) &
-                   *(dvel_ds(dir_idx(i - contxe)))
-        end do
-
-        do i = E_idx, advxe - 1
-            L(i) = (5e-1_wp - 5e-1_wp*sign(1._wp, lambda(2)))*lambda(2) &
-                   *(dadv_ds(i - momxe))
-        end do
-
-        L(advxe) = (5e-1_wp - 5e-1_wp*sign(1._wp, lambda(3)))*lambda(3) &
-                   *(dpres_ds + rho*c*dvel_ds(dir_idx(1)))
-
-        if (chemistry) then
-            do i = chemxb, chemxe
-                L(i) = (5e-1_wp - 5e-1_wp*sign(1._wp, lambda(2)))*lambda(2) &
-                       *(dYs_ds(i - chemxb + 1))
-            end do
-        end if
+        lambda_factor = (5e-1_wp - 5e-1_wp*sign(1._wp, lambda(2)))
+        call s_fill_density_L(L, lambda_factor, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
+        call s_fill_velocity_L(L, lambda_factor, lambda(2), dvel_ds)
+        call s_fill_advection_L(L, lambda_factor, lambda(2), dadv_ds)
+        call s_fill_chemistry_L(L, lambda_factor, lambda(2), dYs_ds)
 
+        lambda_factor = (5e-1_wp - 5e-1_wp*sign(1._wp, lambda(3)))
+        L(advxe) = lambda_factor*lambda(3)*(dpres_ds + rho*c*dvel_ds(dir_idx(1)))
     end subroutine s_compute_nonreflecting_subsonic_buffer_L
-    !>  The L variables for the nonreflecting subsonic inflow CBC
-        !!      see pg. 455, Thompson (1990). This nonreflecting subsonic
-        !!      CBC assumes an incoming flow and reduces the amplitude of
-        !!      any reflections caused by outgoing waves.
+
+    !> Nonreflecting subsonic inflow CBC (Thompson 1990, pg. 455)
     pure subroutine s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_inflow_L
@@ -110,30 +139,15 @@ contains
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: rho, c
-        real(wp), intent(in) :: dpres_ds
+        real(wp), intent(in) :: rho, c, dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
 
-        integer :: i
-
-        L(1) = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
-
-        do i = 2, advxe
-            L(i) = 0._wp
-        end do
-
-        if (chemistry) then
-            do i = chemxb, chemxe
-                L(i) = 0._wp
-            end do
-        end if
-
+        L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
+        L(2:advxe) = 0._wp
+        if (chemistry) L(chemxb:chemxe) = 0._wp
     end subroutine s_compute_nonreflecting_subsonic_inflow_L
 
-    !>  The L variables for the nonreflecting subsonic outflow
-        !!      CBC see pg. 454 of Thompson (1990). This nonreflecting
-        !!      subsonic CBC presumes an outgoing flow and reduces the
-        !!      amplitude of any reflections caused by outgoing waves.
+    !> Nonreflecting subsonic outflow CBC (Thompson 1990, pg. 454)
     pure subroutine s_compute_nonreflecting_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_outflow_L
@@ -149,40 +163,15 @@ contains
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
         real(wp), dimension(num_species), intent(in) :: dYs_ds
 
-        integer :: i !> Generic loop iterator
-
-        L(1) = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
-
-        do i = 2, momxb
-            L(i) = lambda(2)*(c*c*dalpha_rho_ds(i - 1) - mf(i - 1)*dpres_ds)
-        end do
-
-        do i = momxb + 1, momxe
-            L(i) = lambda(2)*(dvel_ds(dir_idx(i - contxe)))
-        end do
-
-        do i = E_idx, advxe - 1
-            L(i) = lambda(2)*(dadv_ds(i - momxe))
-        end do
-
-        ! bubble index
+        L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
+        call s_fill_density_L(L, 1._wp, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
+        call s_fill_velocity_L(L, 1._wp, lambda(2), dvel_ds)
+        call s_fill_advection_L(L, 1._wp, lambda(2), dadv_ds)
+        call s_fill_chemistry_L(L, 1._wp, lambda(2), dYs_ds)
         L(advxe) = 0._wp
-
-        if (chemistry) then
-            do i = chemxb, chemxe
-                L(i) = lambda(2)*dYs_ds(i - chemxb + 1)
-            end do
-        end if
-
     end subroutine s_compute_nonreflecting_subsonic_outflow_L
 
-    !>  The L variables for the force-free subsonic outflow CBC,
-        !!      see pg. 454 of Thompson (1990). The force-free subsonic
-        !!      outflow sets to zero the sum of all of the forces which
-        !!      are acting on a fluid element for the normal coordinate
-        !!      direction to the boundary. As a result, a fluid element
-        !!      at the boundary is simply advected outward at the fluid
-        !!      velocity.
+    !> Force-free subsonic outflow CBC (Thompson 1990, pg. 454)
     pure subroutine s_compute_force_free_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_force_free_subsonic_outflow_L
@@ -197,30 +186,14 @@ contains
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
 
-        integer :: i !> Generic loop iterator
-
-        L(1) = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
-
-        do i = 2, momxb
-            L(i) = lambda(2)*(c*c*dalpha_rho_ds(i - 1) - mf(i - 1)*dpres_ds)
-        end do
-
-        do i = momxb + 1, momxe
-            L(i) = lambda(2)*(dvel_ds(dir_idx(i - contxe)))
-        end do
-
-        do i = E_idx, advxe - 1
-            L(i) = lambda(2)*(dadv_ds(i - momxe))
-        end do
-
+        L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
+        call s_fill_density_L(L, 1._wp, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
+        call s_fill_velocity_L(L, 1._wp, lambda(2), dvel_ds)
+        call s_fill_advection_L(L, 1._wp, lambda(2), dadv_ds)
         L(advxe) = L(1) + 2._wp*rho*c*lambda(2)*dvel_ds(dir_idx(1))
-
     end subroutine s_compute_force_free_subsonic_outflow_L
 
-    !>  L variables for the constant pressure subsonic outflow
-        !!      CBC see pg. 455 Thompson (1990). The constant pressure
-        !!      subsonic outflow maintains a fixed pressure at the CBC
-        !!      boundary in absence of any transverse effects.
+    !> Constant pressure subsonic outflow CBC (Thompson 1990, pg. 455)
     pure subroutine s_compute_constant_pressure_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_constant_pressure_subsonic_outflow_L
@@ -235,31 +208,14 @@ contains
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
 
-        integer :: i !> Generic loop iterator
-
-        L(1) = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
-
-        do i = 2, momxb
-            L(i) = lambda(2)*(c*c*dalpha_rho_ds(i - 1) - mf(i - 1)*dpres_ds)
-        end do
-
-        do i = momxb + 1, momxe
-            L(i) = lambda(2)*(dvel_ds(dir_idx(i - contxe)))
-        end do
-
-        do i = E_idx, advxe - 1
-            L(i) = lambda(2)*(dadv_ds(i - momxe))
-        end do
-
+        L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
+        call s_fill_density_L(L, 1._wp, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
+        call s_fill_velocity_L(L, 1._wp, lambda(2), dvel_ds)
+        call s_fill_advection_L(L, 1._wp, lambda(2), dadv_ds)
         L(advxe) = -L(1)
-
     end subroutine s_compute_constant_pressure_subsonic_outflow_L
 
-    !>  L variables for the supersonic inflow CBC, see pg. 453
-        !!      Thompson (1990). The supersonic inflow CBC is a steady
-        !!      state, or nearly a steady state, CBC in which only the
-        !!      transverse terms may generate a time dependence at the
-        !!      inflow boundary.
+    !> Supersonic inflow CBC (Thompson 1990, pg. 453)
     pure subroutine s_compute_supersonic_inflow_L(L)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_inflow_L
@@ -267,25 +223,11 @@ contains
         $:ROUTINE()
 #endif
         real(wp), dimension(sys_size), intent(inout) :: L
-
-        integer :: i
-
-        do i = 1, advxe
-            L(i) = 0._wp
-        end do
-
-        if (chemistry) then
-            do i = chemxb, chemxe
-                L(i) = 0._wp
-            end do
-        end if
-
+        L(1:advxe) = 0._wp
+        if (chemistry) L(chemxb:chemxe) = 0._wp
     end subroutine s_compute_supersonic_inflow_L
 
-    !>  L variables for the supersonic outflow CBC, see pg. 453
-        !!      of Thompson (1990). For the supersonic outflow CBC, the
-        !!      flow evolution at the boundary is determined completely
-        !!      by the interior data.
+    !> Supersonic outflow CBC (Thompson 1990, pg. 453)
     pure subroutine s_compute_supersonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_outflow_L
@@ -300,30 +242,12 @@ contains
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
         real(wp), dimension(num_species), intent(in) :: dYs_ds
-        integer :: i !< Generic loop iterator
-
-        L(1) = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
-
-        do i = 2, momxb
-            L(i) = lambda(2)*(c*c*dalpha_rho_ds(i - 1) - mf(i - 1)*dpres_ds)
-        end do
-
-        do i = momxb + 1, momxe
-            L(i) = lambda(2)*(dvel_ds(dir_idx(i - contxe)))
-        end do
-
-        do i = E_idx, advxe - 1
-            L(i) = lambda(2)*(dadv_ds(i - momxe))
-        end do
 
+        L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
+        call s_fill_density_L(L, 1._wp, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
+        call s_fill_velocity_L(L, 1._wp, lambda(2), dvel_ds)
+        call s_fill_advection_L(L, 1._wp, lambda(2), dadv_ds)
+        call s_fill_chemistry_L(L, 1._wp, lambda(2), dYs_ds)
         L(advxe) = lambda(3)*(dpres_ds + rho*c*dvel_ds(dir_idx(1)))
-
-        if (chemistry) then
-            do i = chemxb, chemxe
-                L(i) = lambda(2)*dYs_ds(i - chemxb + 1)
-            end do
-        end if
-
     end subroutine s_compute_supersonic_outflow_L
-
 end module m_compute_cbc

From 1f3b8f0f41456ce5375779c2ebdbc2f99fc2dcb9 Mon Sep 17 00:00:00 2001
From: "Mohammed S. Al-Mahrouqi"
 <145478595+mohdsaid497566@users.noreply.github.com>
Date: Wed, 11 Jun 2025 09:38:07 -0400
Subject: [PATCH 36/75] changed output file from default (1) to run_time.inf
 (3) (#504) (#877)

Co-authored-by: mohdsaid497566 <mohdsaid497566@gmail.com>
---
 src/simulation/m_data_output.fpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index fdf81d528a..7407375f86 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -151,18 +151,18 @@ contains
         ! Generating table header for the stability criteria to be outputted
         if (cfl_dt) then
             if (viscous) then
-                write (1, '(A)') '     Time-steps        dt     = Time         ICFL '// &
+                write (3, '(A)') '     Time-steps        dt     = Time         ICFL '// &
                     'Max      VCFL Max        Rc Min       ='
             else
-                write (1, '(A)') '            Time-steps                dt       Time '// &
+                write (3, '(A)') '            Time-steps                dt       Time '// &
                     '               ICFL Max              '
             end if
         else
             if (viscous) then
-                write (1, '(A)') '     Time-steps        Time         ICFL '// &
+                write (3, '(A)') '     Time-steps        Time         ICFL '// &
                     'Max      VCFL Max        Rc Min        '
             else
-                write (1, '(A)') '            Time-steps                Time '// &
+                write (3, '(A)') '            Time-steps                Time '// &
                     '               ICFL Max              '
             end if
         end if
@@ -353,12 +353,12 @@ contains
         ! Outputting global stability criteria extrema at current time-step
         if (proc_rank == 0) then
             if (viscous) then
-                write (1, '(6X,I8,F10.6,6X,6X,F10.6,6X,F9.6,6X,F9.6,6X,F10.6)') &
+                write (3, '(6X,I8,F10.6,6X,6X,F10.6,6X,F9.6,6X,F9.6,6X,F10.6)') &
                     t_step, dt, t_step*dt, icfl_max_glb, &
                     vcfl_max_glb, &
                     Rc_min_glb
             else
-                write (1, '(13X,I8,14X,F10.6,14X,F10.6,13X,F9.6)') &
+                write (3, '(13X,I8,14X,F10.6,14X,F10.6,13X,F9.6)') &
                     t_step, dt, t_step*dt, icfl_max_glb
             end if
 

From bea95b5bec2854fdeeae8f6462a68092c0943523 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 16:29:39 -0400
Subject: [PATCH 37/75] Fixed issues with macro

---
 src/common/m_finite_differences.fpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp
index 80db1d8c79..18a0a84484 100644
--- a/src/common/m_finite_differences.fpp
+++ b/src/common/m_finite_differences.fpp
@@ -18,7 +18,8 @@ contains
         integer :: x, y, z !< Generic loop iterators
 
         real(wp) :: divergence
-        $:PARALLEL_LOOP(collapse=3, gang vector default(present) private=["divergence"])
+
+        $:PARALLEL_LOOP(collapse=3, private=["divergence"])
         do x = ix_s%beg, ix_s%end
             do y = iy_s%beg, iy_s%end
                 do z = iz_s%beg, iz_s%end

From c6d4a0465333ed4d30675b5dbf2137711ecb83be Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 11 Jun 2025 17:19:49 -0400
Subject: [PATCH 38/75] Replaced Openacc routine directives

---
 src/simulation/m_qbmm.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 72a0114865..dbc78a69d1 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -865,7 +865,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_chyqmom
 #else
-            !$acc routine seq
+            $:ROUTINE()
 #endif
             real(wp), intent(in) :: pres, rho, c
             real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff

From a6dac4df28f4cd2e6d4a796228120f5ba474f0ca Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 11 Jun 2025 17:26:30 -0400
Subject: [PATCH 39/75] Added update device macros

---
 src/simulation/m_viscous.fpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 2cc8797e27..0ea46cdd90 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -1000,7 +1000,7 @@ contains
 
         end if
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
 
         if (n > 0) then
             if (p > 0) then
@@ -1097,7 +1097,7 @@ contains
 
         end if
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
 
         if (n > 0) then
             if (p > 0) then
@@ -1200,7 +1200,7 @@ contains
         is3_viscous = iz
         iv = iv_in
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
 
         ! First-Order Spatial Derivatives in x-direction
         if (norm_dir == 1) then

From d1b0e73611635c59dc07e4f6914ed4fb53cc0e74 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 11 Jun 2025 17:30:59 -0400
Subject: [PATCH 40/75] Replaced rest of parallel loop directives

---
 src/simulation/m_viscous.fpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 0ea46cdd90..2cc8797e27 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -1000,7 +1000,7 @@ contains
 
         end if
 
-        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
+        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
 
         if (n > 0) then
             if (p > 0) then
@@ -1097,7 +1097,7 @@ contains
 
         end if
 
-        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
+        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
 
         if (n > 0) then
             if (p > 0) then
@@ -1200,7 +1200,7 @@ contains
         is3_viscous = iz
         iv = iv_in
 
-        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous","iv"])
+        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
 
         ! First-Order Spatial Derivatives in x-direction
         if (norm_dir == 1) then

From 0d2d6ac2bdf6fcef877c046b1d2fd9f8df7881a0 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 19:00:15 -0400
Subject: [PATCH 41/75] Renamed directives to have GPU_ beginning

---
 src/common/include/directive_macros.fpp   |  22 +-
 src/common/m_boundary_common.fpp          |  48 +--
 src/common/m_chemistry.fpp                |  12 +-
 src/common/m_finite_differences.fpp       |   2 +-
 src/common/m_helper.fpp                   |   4 +-
 src/common/m_helper_basic.fpp             |   6 +-
 src/common/m_mpi_common.fpp               |  66 ++--
 src/common/m_phase_change.fpp             |  34 +-
 src/common/m_variables_conversion.fpp     |  98 +++---
 src/pre_process/m_assign_variables.fpp    |   4 +-
 src/pre_process/m_patches.fpp             |   8 +-
 src/simulation/include/inline_riemann.fpp |   4 +-
 src/simulation/m_acoustic_src.fpp         |  54 +--
 src/simulation/m_body_forces.fpp          |  14 +-
 src/simulation/m_boundary_conditions.fpp  |   8 +-
 src/simulation/m_bubbles.fpp              |  38 +--
 src/simulation/m_bubbles_EE.fpp           |  44 +--
 src/simulation/m_bubbles_EL.fpp           |  96 +++---
 src/simulation/m_bubbles_EL_kernels.fpp   |  16 +-
 src/simulation/m_cbc.fpp                  | 180 +++++-----
 src/simulation/m_compute_cbc.fpp          |  16 +-
 src/simulation/m_data_output.fpp          |  14 +-
 src/simulation/m_fftw.fpp                 |  26 +-
 src/simulation/m_global_parameters.fpp    | 152 ++++-----
 src/simulation/m_hyperelastic.fpp         |  36 +-
 src/simulation/m_hypoelastic.fpp          |  50 +--
 src/simulation/m_ibm.fpp                  |  52 +--
 src/simulation/m_mhd.fpp                  |  18 +-
 src/simulation/m_mpi_proxy.fpp            |  74 ++--
 src/simulation/m_qbmm.fpp                 |  58 ++--
 src/simulation/m_rhs.fpp                  | 144 ++++----
 src/simulation/m_riemann_solvers.fpp      | 390 +++++++++++-----------
 src/simulation/m_sim_helpers.fpp          |  18 +-
 src/simulation/m_start_up.fpp             |  52 +--
 src/simulation/m_surface_tension.fpp      |  28 +-
 src/simulation/m_time_steppers.fpp        |  48 +--
 src/simulation/m_viscous.fpp              | 208 ++++++------
 src/simulation/m_weno.fpp                 |  50 +--
 38 files changed, 1096 insertions(+), 1096 deletions(-)

diff --git a/src/common/include/directive_macros.fpp b/src/common/include/directive_macros.fpp
index 4e6e62f746..0ca82b305c 100644
--- a/src/common/include/directive_macros.fpp
+++ b/src/common/include/directive_macros.fpp
@@ -171,7 +171,7 @@
 #:enddef
 
 
-#:def PARALLEL_LOOP(collapse=None, private=None, parallelism=['gang', 'vector'], &
+#:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism=['gang', 'vector'], &
     & default='present', firstprivate=None, reduction=None, reductionOp=None, &
     & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, &
     & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None)
@@ -215,7 +215,7 @@
     $:acc_directive
 #:enddef
 
-#:def ROUTINE(parallelism=['seq'], nohost=False, extraAccArgs=None)
+#:def GPU_ROUTINE(parallelism=['seq'], nohost=False, extraAccArgs=None)
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
 
     #:assert isinstance(nohost, bool)
@@ -233,7 +233,7 @@
     $:acc_directive
 #:enddef
 
-#:def DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
+#:def GPU_DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
     
     #:set copy_val = GEN_COPY_STR(copy)
 
@@ -259,7 +259,7 @@
     $:acc_directive
 #:enddef
 
-#:def LOOP(collapse=None, parallelism=["seq"], data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
+#:def GPU_LOOP(collapse=None, parallelism=["seq"], data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
 
     #:set collapse_val = GEN_COLLAPSE_STR(collapse)
 
@@ -287,7 +287,7 @@
     $:acc_directive
 #:enddef
 
-#:def DATA(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None)
+#:def GPU_DATA(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None)
     
     #:set copy_val = GEN_COPY_STR(copy)
 
@@ -318,7 +318,7 @@
     $:acc_directive
 #:enddef
 
-#:def ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None)
+#:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None)
     #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
 
     #:set create_val = GEN_CREATE_STR(create)
@@ -332,7 +332,7 @@
     $:acc_directive
 #:enddef
 
-#:def EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None)
+#:def GPU_EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None)
     #:set copyout_val = GEN_COPYOUT_STR(copyout)
 
     #:set delete_val = GEN_DELETE_STR(delete)
@@ -346,7 +346,7 @@
     $:acc_directive
 #:enddef
 
-#:def CACHE(cache, extraAccArgs=None)
+#:def GPU_CACHE(cache, extraAccArgs=None)
     #:set cache_val = GEN_PARENTHESES_CLAUSE('cache', cache)
 
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
@@ -356,7 +356,7 @@
     $:acc_directive
 #:enddef
 
-#:def ATOMIC(atomic='update', extraAccArgs=None)
+#:def GPU_ATOMIC(atomic='update', extraAccArgs=None)
     #:assert isinstance(atomic, str)
     #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture')
 
@@ -369,7 +369,7 @@
     $:acc_directive
 #:enddef
 
-#:def UPDATE(host=None, device=None, extraAccArgs=None)
+#:def GPU_UPDATE(host=None, device=None, extraAccArgs=None)
     #:set host_val = GEN_HOST_STR(host)
 
     #:set device_val = GEN_DEVICE_STR(device)
@@ -381,7 +381,7 @@
     $:acc_directive
 #:enddef
 
-#:def WAIT(host=None, device=None, extraAccArgs=None)
+#:def GPU_WAIT(host=None, device=None, extraAccArgs=None)
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
 
     #:set clause_val = host_val.strip('\n') + device_val.strip('\n')
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index d9f6094600..5f9f8fa16e 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -21,7 +21,7 @@ module m_boundary_common
     implicit none
 
     type(scalar_field), dimension(:, :), allocatable :: bc_buffers
-    $:DECLARE(create=["bc_buffers"])
+    $:GPU_DECLARE(create=["bc_buffers"])
 
     real(wp) :: bcxb, bcxe, bcyb, bcye, bczb, bcze
 
@@ -84,7 +84,7 @@ contains
         if (bcxb >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 1, -1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (int(bc_type(1, -1)%sf(0, k, l)))
@@ -108,7 +108,7 @@ contains
         if (bcxe >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 1, 1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (int(bc_type(1, 1)%sf(0, k, l)))
@@ -136,7 +136,7 @@ contains
         if (bcyb >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 2, -1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(2, -1)%sf(k, 0, l)))
@@ -162,7 +162,7 @@ contains
         if (bcye >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 2, 1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(2, 1)%sf(k, 0, l)))
@@ -190,7 +190,7 @@ contains
         if (bczb >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 3, -1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(3, -1)%sf(k, l, 0)))
@@ -214,7 +214,7 @@ contains
         if (bcze >= 0) then
             call s_mpi_sendrecv_variables_buffers(q_prim_vf, pb, mv, 3, 1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (int(bc_type(3, 1)%sf(k, l, 0)))
@@ -242,7 +242,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_ghost_cell_extrapolation
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -311,7 +311,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_symmetry
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -575,7 +575,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_periodic
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -718,7 +718,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_axis
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -781,7 +781,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_slip_wall
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -880,7 +880,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_no_slip_wall
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -1015,7 +1015,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_dirichlet
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -1084,7 +1084,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_qbmm_extrapolation
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
         integer, intent(in) :: bc_dir, bc_loc
@@ -1167,7 +1167,7 @@ contains
         if (bcxb >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 1, -1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (bc_type(1, -1)%sf(0, k, l))
@@ -1185,7 +1185,7 @@ contains
         if (bcxe >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 1, 1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = 0, n
                     select case (bc_type(1, 1)%sf(0, k, l))
@@ -1206,7 +1206,7 @@ contains
         if (bcyb >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 2, -1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (bc_type(2, -1)%sf(k, 0, l))
@@ -1224,7 +1224,7 @@ contains
         if (bcye >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 2, 1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = 0, p
                 do k = -buff_size, m + buff_size
                     select case (bc_type(2, 1)%sf(k, 0, l))
@@ -1245,7 +1245,7 @@ contains
         if (bczb >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 3, -1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (bc_type(3, -1)%sf(k, l, 0))
@@ -1263,7 +1263,7 @@ contains
         if (bcze >= 0) then
             call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 3, 1)
         else
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = -buff_size, n + buff_size
                 do k = -buff_size, m + buff_size
                     select case (bc_type(3, 1)%sf(k, l, 0))
@@ -1283,7 +1283,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_periodic
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
@@ -1341,7 +1341,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_reflective
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
@@ -1423,7 +1423,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_ghost_cell_extrapolation
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index 14b3c3d1a6..4984a2248f 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -36,7 +36,7 @@ contains
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do eqn = chemxb, chemxe
                         Ys(eqn - chemxb + 1) = &
                             q_cons_vf(eqn)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z)
@@ -47,7 +47,7 @@ contains
                     ! cons. contxb    = \rho         (1-fluid model)
                     ! cons. momxb + i = \rho u_i
                     energy = q_cons_vf(E_idx)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z)
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do eqn = momxb, momxe
                         energy = energy - &
                                  0.5_wp*(q_cons_vf(eqn)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z))**2._wp
@@ -73,7 +73,7 @@ contains
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = chemxb, chemxe
                         Ys(i - chemxb + 1) = q_prim_vf(i)%sf(x, y, z)
                     end do
@@ -100,12 +100,12 @@ contains
         real(wp), dimension(num_species) :: Ys
         real(wp), dimension(num_species) :: omega
 
-        $:PARALLEL_LOOP(collapse=3, private=["Ys", "omega"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["Ys", "omega"])
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do eqn = chemxb, chemxe
                         Ys(eqn - chemxb + 1) = q_prim_qp(eqn)%sf(x, y, z)
                     end do
@@ -115,7 +115,7 @@ contains
 
                     call get_net_production_rates(rho, T, Ys, omega)
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do eqn = chemxb, chemxe
 
                         omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1)
diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp
index 18a0a84484..8119c1257a 100644
--- a/src/common/m_finite_differences.fpp
+++ b/src/common/m_finite_differences.fpp
@@ -19,7 +19,7 @@ contains
 
         real(wp) :: divergence
 
-        $:PARALLEL_LOOP(collapse=3, private=["divergence"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["divergence"])
         do x = ix_s%beg, ix_s%end
             do y = iy_s%beg, iy_s%end
                 do z = iz_s%beg, iz_s%end
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index a73afddda0..cbcabc0a3f 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -44,7 +44,7 @@ contains
         !! @param Rtmp is the  bubble radii
         !! @param ntmp is the output number bubble density
     pure subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: vftmp
         real(wp), dimension(nb), intent(in) :: Rtmp
         real(wp), intent(out) :: ntmp
@@ -58,7 +58,7 @@ contains
     end subroutine s_comp_n_from_prim
 
     pure subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: vftmp
         real(wp), dimension(nb), intent(in) :: nRtmp
         real(wp), intent(out) :: ntmp
diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp
index d0ace9e551..b6afc36416 100644
--- a/src/common/m_helper_basic.fpp
+++ b/src/common/m_helper_basic.fpp
@@ -24,7 +24,7 @@ contains
     !! @param tol_input Relative error (default = 1e-6_wp).
     !! @return Result of the comparison.
     logical pure elemental function f_approx_equal(a, b, tol_input) result(res)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: a, b
         real(wp), optional, intent(in) :: tol_input
         real(wp) :: tol
@@ -47,7 +47,7 @@ contains
     !> Checks if a real(wp) variable is of default value.
     !! @param var Variable to check.
     logical pure elemental function f_is_default(var) result(res)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: var
 
         res = f_approx_equal(var, dflt_real)
@@ -72,7 +72,7 @@ contains
     !> Checks if a real(wp) variable is an integer.
     !! @param var Variable to check.
     logical pure elemental function f_is_integer(var) result(res)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: var
 
         res = f_approx_equal(var, real(nint(var), wp))
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 1fdc19aa0d..6ea2bc60f2 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -26,7 +26,7 @@ module m_mpi_common
     implicit none
 
     integer, private :: err_code, ierr, v_size !<
-    $:DECLARE(create=["v_size"])
+    $:GPU_DECLARE(create=["v_size"])
     !! Generic flags used to identify and report MPI errors
 
     real(wp), private, allocatable, dimension(:), target :: buff_send !<
@@ -39,10 +39,10 @@ module m_mpi_common
     !! average primitive variables, for a single computational domain boundary
     !! at the time, from the relevant neighboring processor.
 
-    $:DECLARE(create=["buff_send","buff_recv"])
+    $:GPU_DECLARE(create=["buff_send","buff_recv"])
 
     integer :: halo_size, nVars
-    $:DECLARE(create=["halo_size","nVars"])
+    $:GPU_DECLARE(create=["halo_size","nVars"])
 
 contains
 
@@ -639,7 +639,7 @@ contains
 #ifdef MFC_MPI
 
         call nvtxStartRange("RHS-COMM-PACKBUF")
-        $:UPDATE(device=["v_size"])
+        $:GPU_UPDATE(device=["v_size"])
 
 #ifdef MFC_SIMULATION
         if (qbmm .and. .not. polytropic) then
@@ -693,7 +693,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do l = 0, p
                         do k = 0, n
                             do j = 0, buff_size - 1
@@ -707,7 +707,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:PARALLEL_LOOP(collapse=4,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                         do l = 0, p
                             do k = 0, n
                                 do j = 0, buff_size - 1
@@ -722,7 +722,7 @@ contains
                             end do
                         end do
 
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do l = 0, p
                             do k = 0, n
                                 do j = 0, buff_size - 1
@@ -739,7 +739,7 @@ contains
                     end if
 #endif
                 #:elif mpi_dir == 2
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, sys_size
                         do l = 0, p
                             do k = 0, buff_size - 1
@@ -755,7 +755,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = 0, buff_size - 1
@@ -771,7 +771,7 @@ contains
                             end do
                         end do
 
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = 0, buff_size - 1
@@ -789,7 +789,7 @@ contains
                     end if
 #endif
                 #:else
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, sys_size
                         do l = 0, buff_size - 1
                             do k = -buff_size, n + buff_size
@@ -805,7 +805,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, buff_size - 1
                                 do k = -buff_size, n + buff_size
@@ -821,7 +821,7 @@ contains
                             end do
                         end do
 
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, buff_size - 1
                                 do k = -buff_size, n + buff_size
@@ -856,7 +856,7 @@ contains
                     call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
                 #:else
                     call nvtxStartRange("RHS-COMM-DEV2HOST")
-                    $:UPDATE(host=["buff_send"])
+                    $:GPU_UPDATE(host=["buff_send"])
                     call nvtxEndRange
                     call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
                 #:endif
@@ -874,7 +874,7 @@ contains
                     !$acc wait
                 #:else
                     call nvtxStartRange("RHS-COMM-HOST2DEV")
-                    $:UPDATE(device=["buff_recv"])
+                    $:GPU_UPDATE(device=["buff_recv"])
                     call nvtxEndRange
                 #:endif
             end if
@@ -891,7 +891,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do l = 0, p
                         do k = 0, n
                             do j = -buff_size, -1
@@ -912,7 +912,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do l = 0, p
                             do k = 0, n
                                 do j = -buff_size, -1
@@ -927,7 +927,7 @@ contains
                             end do
                         end do
 
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do l = 0, p
                             do k = 0, n
                                 do j = -buff_size, -1
@@ -944,7 +944,7 @@ contains
                     end if
 #endif
                 #:elif mpi_dir == 2
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, sys_size
                         do l = 0, p
                             do k = -buff_size, -1
@@ -966,7 +966,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = -buff_size, -1
@@ -982,7 +982,7 @@ contains
                             end do
                         end do
 
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = -buff_size, -1
@@ -1001,7 +1001,7 @@ contains
 #endif
                 #:else
                     ! Unpacking buffer from bc_z%beg
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, sys_size
                         do l = -buff_size, -1
                             do k = -buff_size, n + buff_size
@@ -1024,7 +1024,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = -buff_size, -1
                                 do k = -buff_size, n + buff_size
@@ -1041,7 +1041,7 @@ contains
                             end do
                         end do
 
-                        $:PARALLEL_LOOP(collapse=5,private=["r"])
+                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
                         do i = sys_size + 1, sys_size + 4
                             do l = -buff_size, -1
                                 do k = -buff_size, n + buff_size
@@ -1089,7 +1089,7 @@ contains
 #ifdef MFC_MPI
 
         nVars = num_dims + 1
-        $:UPDATE(device=["nVars"])
+        $:GPU_UPDATE(device=["nVars"])
 
         buffer_counts = (/ &
                         buff_size*nVars*(n + 1)*(p + 1), &
@@ -1131,7 +1131,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do l = 0, p
                         do k = 0, n
                             do j = 0, buff_size - 1
@@ -1144,7 +1144,7 @@ contains
                     end do
 
                 #:elif mpi_dir == 2
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, nVars
                         do l = 0, p
                             do k = 0, buff_size - 1
@@ -1159,7 +1159,7 @@ contains
                     end do
 
                 #:else
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, nVars
                         do l = 0, buff_size - 1
                             do k = -buff_size, n + buff_size
@@ -1187,7 +1187,7 @@ contains
                     call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
                 #:else
                     call nvtxStartRange("RHS-COMM-DEV2HOST")
-                    $:UPDATE(host=["buff_send"])
+                    $:GPU_UPDATE(host=["buff_send"])
                     call nvtxEndRange
                     call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
                 #:endif
@@ -1205,7 +1205,7 @@ contains
                     !$acc wait
                 #:else
                     call nvtxStartRange("RHS-COMM-HOST2DEV")
-                    $:UPDATE(device=["buff_recv"])
+                    $:GPU_UPDATE(device=["buff_recv"])
                     call nvtxEndRange
                 #:endif
             end if
@@ -1221,7 +1221,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do l = 0, p
                         do k = 0, n
                             do j = -buff_size, -1
@@ -1241,7 +1241,7 @@ contains
                     end do
 
                 #:elif mpi_dir == 2
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, nVars
                         do l = 0, p
                             do k = -buff_size, -1
@@ -1263,7 +1263,7 @@ contains
 
                 #:else
                     ! Unpacking buffer from bc_z%beg
-                    $:PARALLEL_LOOP(collapse=4,private=["r"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
                     do i = 1, nVars
                         do l = -buff_size, -1
                             do k = -buff_size, n + buff_size
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 5bbb8c4e8c..15f11be298 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -41,7 +41,7 @@ module m_phase_change
     real(wp) :: A, B, C, D
     !> @}
 
-    $:DECLARE(create=["max_iter","pCr","TCr","mixM","lp","vp","A","B","C","D"])
+    $:GPU_DECLARE(create=["max_iter","pCr","TCr","mixM","lp","vp","A","B","C","D"])
 
 contains
 
@@ -88,17 +88,17 @@ contains
         real(wp) :: rho, rM, m1, m2, MCT !< total density, total reacting mass, individual reacting masses
         real(wp) :: TvF !< total volume fraction
 
-        $:DECLARE(create=["pS","pSOV","pSSL","TS","TSOV","TSSL","TSatOV","TSatSL"])
-        $:DECLARE(create=["rhoe","dynE","rhos","rho","rM","m1","m2","MCT","TvF"])
+        $:GPU_DECLARE(create=["pS","pSOV","pSSL","TS","TSOV","TSSL","TSatOV","TSatSL"])
+        $:GPU_DECLARE(create=["rhoe","dynE","rhos","rho","rM","m1","m2","MCT","TvF"])
 
         real(wp), dimension(num_fluids) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
-        $:DECLARE(create=["p_infOV","p_infpT","p_infSL","sk","hk","gk","ek","rhok"])
+        $:GPU_DECLARE(create=["p_infOV","p_infpT","p_infSL","sk","hk","gk","ek","rhok"])
 
         !< Generic loop iterators
         integer :: i, j, k, l
 
         ! starting equilibrium solver
-        $:PARALLEL_LOOP(collapse=3, private=["p_infOV", "p_infpT", "p_infSL", &
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["p_infOV", "p_infpT", "p_infSL", &
             "sk", "hk", "gk", "ek", "rhok", "pS", "pSOV", "pSSL", &
             "TS", "TSOV", "TSatOV", "TSatSL", "TSSL", "rhoe", &
             "dynE", "rhos", "rho", "rM", "m1", "m2", "MCT", "TvF"])
@@ -107,7 +107,7 @@ contains
                 do l = 0, p
 
                     rho = 0.0_wp; TvF = 0.0_wp
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, num_fluids
 
                         ! Mixture density
@@ -133,7 +133,7 @@ contains
 
                     ! kinetic energy as an auxiliary variable to the calculation of the total internal energy
                     dynE = 0.0_wp
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = momxb, momxe
 
                         dynE = dynE + 5.0e-1_wp*q_cons_vf(i)%sf(j, k, l)**2/rho
@@ -255,7 +255,7 @@ contains
 
                     ! calculating volume fractions, internal energies, and total entropy
                     rhos = 0.0_wp
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, num_fluids
 
                         ! volume fractions
@@ -290,7 +290,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_infinite_pt_relaxation_k
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         ! initializing variables
@@ -307,7 +307,7 @@ contains
         ! auxiliary variables for the pT-equilibrium solver
         mCP = 0.0_wp; mQ = 0.0_wp; p_infpT = ps_inf; 
         ! Performing tests before initializing the pT-equilibrium
-        $:LOOP()
+        $:GPU_LOOP()
         do i = 1, num_fluids
 
             ! sum of the total alpha*rho*cp of the system
@@ -356,7 +356,7 @@ contains
 
             ! updating functions used in the Newton's solver
             gpp = 0.0_wp; gp = 0.0_wp; hp = 0.0_wp
-            $:LOOP()
+            $:GPU_LOOP()
             do i = 1, num_fluids
 
                 gp = gp + (gs_min(i) - 1.0_wp)*q_cons_vf(i + contxb - 1)%sf(j, k, l)*cvs(i) &
@@ -394,7 +394,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_infinite_ptg_relaxation_k
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         integer, intent(in) :: j, k, l
@@ -450,7 +450,7 @@ contains
             mCP = 0.0_wp; mCPD = 0.0_wp; mCVGP = 0.0_wp; mCVGP2 = 0.0_wp; mQ = 0.0_wp; mQD = 0.0_wp
             ! Those must be updated through the iterations, as they either depend on
             ! the partial masses for all fluids, or on the equilibrium pressure
-            $:LOOP()
+            $:GPU_LOOP()
             do i = 1, num_fluids
 
                 ! sum of the total alpha*rho*cp of the system
@@ -519,7 +519,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_correct_partial_densities
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         !> @name variables for the correction of the reacting partial densities
@@ -582,7 +582,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_jacobian_matrix
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         real(wp), dimension(2, 2), intent(out) :: InvJac
@@ -689,7 +689,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_pTg_residue
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         integer, intent(in) :: j, k, l
@@ -740,7 +740,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_TSat
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         real(wp), intent(in) :: pSat
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index a23b5ff2ae..f7d5e9075a 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -51,16 +51,16 @@ module m_variables_conversion
     !! In simulation, gammas, pi_infs, and qvs are already declared in m_global_variables
 #ifndef MFC_SIMULATION
     real(wp), allocatable, public, dimension(:) :: gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps
-    $:DECLARE(create=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps"])
+    $:GPU_DECLARE(create=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps"])
 #endif
 
     real(wp), allocatable, dimension(:) :: Gs
     integer, allocatable, dimension(:) :: bubrs
     real(wp), allocatable, dimension(:, :) :: Res
-    $:DECLARE(create=["bubrs","Gs","Res"])
+    $:GPU_DECLARE(create=["bubrs","Gs","Res"])
 
     integer :: is1b, is2b, is3b, is1e, is2e, is3e
-    $:DECLARE(create=["is1b","is2b","is3b","is1e","is2e","is3e"])
+    $:GPU_DECLARE(create=["is1b","is2b","is3b","is1e","is2e","is3e"])
 
     real(wp), allocatable, dimension(:, :, :), public :: rho_sf !< Scalar density function
     real(wp), allocatable, dimension(:, :, :), public :: gamma_sf !< Scalar sp. heat ratio function
@@ -121,7 +121,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_pressure
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         real(wp), intent(in) :: energy, alf
@@ -462,7 +462,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_acc
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K
@@ -543,7 +543,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_bubbles_acc
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         real(wp), intent(inout) :: rho_K, gamma_K, pi_inf_K, qv_K
@@ -643,7 +643,7 @@ contains
             qvs(i) = fluid_pp(i)%qv
             qvps(i) = fluid_pp(i)%qvp
         end do
-        $:UPDATE(device=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps","Gs"])
+        $:GPU_UPDATE(device=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps","Gs"])
 
 #ifdef MFC_SIMULATION
 
@@ -655,7 +655,7 @@ contains
                 end do
             end do
 
-            $:UPDATE(device=["Res","Re_idx","Re_size"])
+            $:GPU_UPDATE(device=["Res","Re_idx","Re_size"])
         end if
 #endif
 
@@ -669,7 +669,7 @@ contains
             do i = 1, nb
                 bubrs(i) = bub_idx%rs(i)
             end do
-            $:UPDATE(device=["bubrs"])
+            $:GPU_UPDATE(device=["bubrs"])
         end if
 
 #ifdef MFC_POST_PROCESS
@@ -749,7 +749,7 @@ contains
 
                     nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l)
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, nb
                         mu = qK_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc
                         sig = (qK_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc - mu**2)**0.5_wp
@@ -782,7 +782,7 @@ contains
 
                     nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l)
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, nb
                         mu = qK_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc
                         sig = (qK_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc - mu**2)**0.5_wp
@@ -870,7 +870,7 @@ contains
             end if
         #:endif
 
-        $:PARALLEL_LOOP(collapse=3, private=["alpha_K", "alpha_rho_K", "Re_K", &
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_K", "alpha_rho_K", "Re_K", &
             "nRtmp", "rho_K", "gamma_K", "pi_inf_K", "qv_K", &
             "dyn_pres_K", "rhoYks", "B"])
         do l = ibounds(3)%beg, ibounds(3)%end
@@ -878,7 +878,7 @@ contains
                 do j = ibounds(1)%beg, ibounds(1)%end
                     dyn_pres_K = 0._wp
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, num_fluids
                         alpha_rho_K(i) = qK_cons_vf(i)%sf(j, k, l)
                         alpha_K(i) = qK_cons_vf(advxb + i - 1)%sf(j, k, l)
@@ -922,13 +922,13 @@ contains
                         B2 = B(1)**2 + B(2)**2 + B(3)**2
 
                         m2 = 0._wp
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = momxb, momxe
                             m2 = m2 + qK_cons_vf(i)%sf(j, k, l)**2
                         end do
 
                         S = 0._wp
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, 3
                             S = S + qK_cons_vf(momxb + i - 1)%sf(j, k, l)*B(i)
                         end do
@@ -936,14 +936,14 @@ contains
                         E = qK_cons_vf(E_idx)%sf(j, k, l)
 
                         D = 0._wp
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, contxe
                             D = D + qK_cons_vf(i)%sf(j, k, l)
                         end do
 
                         ! Newton-Raphson
                         W = E + D
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do iter = 1, relativity_cons_to_prim_max_iter
                             Ga = (W + B2)*W/sqrt((W + B2)**2*W**2 - (m2*W**2 + S**2*(2*W + B2)))
                             pres = (W - D*Ga)/((gamma_K + 1)*Ga**2) ! Thermal pressure from EOS
@@ -969,13 +969,13 @@ contains
                         qK_prim_vf(E_idx)%sf(j, k, l) = (W - D*Ga)/((gamma_K + 1)*Ga**2)
 
                         ! Recover the other primitive variables
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, 3
                             qK_prim_vf(momxb + i - 1)%sf(j, k, l) = (qK_cons_vf(momxb + i - 1)%sf(j, k, l) + (S/W)*B(i))/(W + B2)
                         end do
                         qK_prim_vf(1)%sf(j, k, l) = D/Ga ! Hard-coded for single-component for now
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = B_idx%beg, B_idx%end
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                         end do
@@ -985,22 +985,22 @@ contains
 
                     if (chemistry) then
                         rho_K = 0._wp
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = chemxb, chemxe
                             rho_K = rho_K + max(0._wp, qK_cons_vf(i)%sf(j, k, l))
                         end do
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, contxe
                             qK_prim_vf(i)%sf(j, k, l) = rho_K
                         end do
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = chemxb, chemxe
                             qK_prim_vf(i)%sf(j, k, l) = max(0._wp, qK_cons_vf(i)%sf(j, k, l)/rho_K)
                         end do
                     else
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, contxe
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                         end do
@@ -1010,7 +1010,7 @@ contains
                     rho_K = max(rho_K, sgm_eps)
 #endif
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = momxb, momxe
                         if (model_eqns /= 4) then
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) &
@@ -1024,7 +1024,7 @@ contains
                     end do
 
                     if (chemistry) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_species
                             rhoYks(i) = qK_cons_vf(chemxb + i - 1)%sf(j, k, l)
                         end do
@@ -1054,7 +1054,7 @@ contains
                     end if
 
                     if (bubbles_euler) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, nb
                             nRtmp(i) = qK_cons_vf(bubrs(i))%sf(j, k, l)
                         end do
@@ -1066,7 +1066,7 @@ contains
                             nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l)
 
                             !Convert cons to prim
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = bubxb, bubxe
                                 qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc
                             end do
@@ -1083,7 +1083,7 @@ contains
                                 call s_comp_n_from_cons(vftmp, nRtmp, nbub_sc, weight)
                             end if
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = bubxb, bubxe
                                 qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc
                             end do
@@ -1091,21 +1091,21 @@ contains
                     end if
 
                     if (mhd) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = B_idx%beg, B_idx%end
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                         end do
                     end if
 
                     if (elasticity) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = strxb, strxe
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K
                         end do
                     end if
 
                     if (hypoelasticity) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = strxb, strxe
                             ! subtracting elastic contribution for pressure calculation
                             if (G_K > verysmall) then
@@ -1122,13 +1122,13 @@ contains
                     end if
 
                     if (hyperelasticity) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = xibeg, xiend
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K
                         end do
                     end if
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = advxb, advxe
                         qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                     end do
@@ -1462,33 +1462,33 @@ contains
         is2b = is2%beg; is2e = is2%end
         is3b = is3%beg; is3e = is3%end
 
-        $:UPDATE(device=["is1b","is2b","is3b","is1e","is2e","is3e"])
+        $:GPU_UPDATE(device=["is1b","is2b","is3b","is1e","is2e","is3e"])
 
         ! Computing the flux variables from the primitive variables, without
         ! accounting for the contribution of either viscosity or capillarity
 #ifdef MFC_SIMULATION
-        $:PARALLEL_LOOP(collapse=3, private=["alpha_rho_K", "vel_K", &
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_rho_K", "vel_K", &
             "alpha_K", "Re_K", "Y_K"])
         do l = is3b, is3e
             do k = is2b, is2e
                 do j = is1b, is1e
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, contxe
                         alpha_rho_K(i) = qK_prim_vf(j, k, l, i)
                     end do
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = advxb, advxe
                         alpha_K(i - E_idx) = qK_prim_vf(j, k, l, i)
                     end do
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, num_vels
                         vel_K(i) = qK_prim_vf(j, k, l, contxe + i)
                     end do
 
                     vel_K_sum = 0._wp
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, num_vels
                         vel_K_sum = vel_K_sum + vel_K(i)**2._wp
                     end do
@@ -1509,7 +1509,7 @@ contains
                     ! Computing the energy from the pressure
 
                     if (chemistry) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = chemxb, chemxe
                             Y_K(i - chemxb + 1) = qK_prim_vf(j, k, l, i)
                         end do
@@ -1526,12 +1526,12 @@ contains
                     end if
 
                     ! mass flux, this should be \alpha_i \rho_i u_i
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, contxe
                         FK_vf(j, k, l, i) = alpha_rho_K(i)*vel_K(dir_idx(1))
                     end do
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, num_vels
                         FK_vf(j, k, l, contxe + dir_idx(i)) = &
                             rho_K*vel_K(dir_idx(1)) &
@@ -1544,14 +1544,14 @@ contains
 
                     ! Species advection Flux, \rho*u*Y
                     if (chemistry) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_species
                             FK_vf(j, k, l, i - 1 + chemxb) = vel_K(dir_idx(1))*(rho_K*Y_K(i))
                         end do
                     end if
 
                     if (riemann_solver == 1 .or. riemann_solver == 4) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = advxb, advxe
                             FK_vf(j, k, l, i) = 0._wp
                             FK_src_vf(j, k, l, i) = alpha_K(i - E_idx)
@@ -1559,12 +1559,12 @@ contains
 
                     else
                         ! Could be bubbles_euler!
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = advxb, advxe
                             FK_vf(j, k, l, i) = vel_K(dir_idx(1))*alpha_K(i - E_idx)
                         end do
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = advxb, advxe
                             FK_src_vf(j, k, l, i) = vel_K(dir_idx(1))
                         end do
@@ -1604,7 +1604,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_speed_of_sound
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         real(wp), intent(in) :: pres
@@ -1638,7 +1638,7 @@ contains
                 c = (1._wp/(rho*(adv(1)/blkmod1 + adv(2)/blkmod2)))
             elseif (model_eqns == 3) then
                 c = 0._wp
-                $:LOOP()
+                $:GPU_LOOP()
                 do q = 1, num_fluids
                     c = c + adv(q)*(1._wp/gammas(q) + 1._wp)* &
                         (pres + pi_infs(q)/(gammas(q) + 1._wp))
@@ -1674,7 +1674,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_fast_magnetosonic_speed
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         real(wp), intent(in) :: B(3), rho, c
diff --git a/src/pre_process/m_assign_variables.fpp b/src/pre_process/m_assign_variables.fpp
index 1adfd6ab1f..0f77df808d 100644
--- a/src/pre_process/m_assign_variables.fpp
+++ b/src/pre_process/m_assign_variables.fpp
@@ -104,7 +104,7 @@ contains
         !! @param patch_id_fp Array to track patch ids
     pure subroutine s_assign_patch_mixture_primitive_variables(patch_id, j, k, l, &
                                                                eta, q_prim_vf, patch_id_fp)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 
         integer, intent(in) :: patch_id
         integer, intent(in) :: j, k, l
@@ -277,7 +277,7 @@ contains
         !! @param patch_id_fp Array to track patch ids
     impure subroutine s_assign_patch_species_primitive_variables(patch_id, j, k, l, &
                                                                  eta, q_prim_vf, patch_id_fp)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 
         integer, intent(in) :: patch_id
         integer, intent(in) :: j, k, l
diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp
index fe861c4c06..2dfefe6623 100644
--- a/src/pre_process/m_patches.fpp
+++ b/src/pre_process/m_patches.fpp
@@ -2382,7 +2382,7 @@ contains
     end subroutine s_model
 
     subroutine s_convert_cylindrical_to_cartesian_coord(cyl_y, cyl_z)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 
         real(wp), intent(in) :: cyl_y, cyl_z
 
@@ -2393,7 +2393,7 @@ contains
 
     pure function f_convert_cyl_to_cart(cyl) result(cart)
 
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 
         t_vec3, intent(in) :: cyl
         t_vec3 :: cart
@@ -2405,7 +2405,7 @@ contains
     end function f_convert_cyl_to_cart
 
     subroutine s_convert_cylindrical_to_spherical_coord(cyl_x, cyl_y)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 
         real(wp), intent(IN) :: cyl_x, cyl_y
 
@@ -2418,7 +2418,7 @@ contains
     !! @param offset Thickness
     !! @param a Starting position
     pure elemental function f_r(myth, offset, a)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: myth, offset, a
         real(wp) :: b
         real(wp) :: f_r
diff --git a/src/simulation/include/inline_riemann.fpp b/src/simulation/include/inline_riemann.fpp
index 81e311141a..b0ed99390b 100644
--- a/src/simulation/include/inline_riemann.fpp
+++ b/src/simulation/include/inline_riemann.fpp
@@ -1,7 +1,7 @@
 #:def arithmetic_avg()
     rho_avg = 5e-1_wp*(rho_L + rho_R)
     vel_avg_rms = 0._wp
-    $:LOOP()
+    $:GPU_LOOP()
     do i = 1, num_vels
         vel_avg_rms = vel_avg_rms + (5e-1_wp*(vel_L(i) + vel_R(i)))**2._wp
     end do
@@ -17,7 +17,7 @@
 
     vel_avg_rms = 0._wp
 
-    $:LOOP()
+    $:GPU_LOOP()
     do i = 1, num_vels
         vel_avg_rms = vel_avg_rms + (sqrt(rho_L)*vel_L(i) + sqrt(rho_R)*vel_R(i))**2._wp/ &
                       (sqrt(rho_L) + sqrt(rho_R))**2._wp
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index d7186d75a3..3e1dea3e5e 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -24,43 +24,43 @@ module m_acoustic_src
     private; public :: s_initialize_acoustic_src, s_precalculate_acoustic_spatial_sources, s_acoustic_src_calculations
 
     integer, allocatable, dimension(:) :: pulse, support
-    $:DECLARE(create=["pulse","support"])
+    $:GPU_DECLARE(create=["pulse","support"])
 
     logical, allocatable, dimension(:) :: dipole
-    $:DECLARE(create=["dipole"])
+    $:GPU_DECLARE(create=["dipole"])
 
     real(wp), allocatable, target, dimension(:, :) :: loc_acoustic
-    $:DECLARE(create=["loc_acoustic"])
+    $:GPU_DECLARE(create=["loc_acoustic"])
 
     real(wp), allocatable, dimension(:) :: mag, length, height, wavelength, frequency
     real(wp), allocatable, dimension(:) :: gauss_sigma_dist, gauss_sigma_time, npulse, dir, delay
-    $:DECLARE(create=["mag","length","height","wavelength","frequency"])
-    $:DECLARE(create=["gauss_sigma_dist","gauss_sigma_time","npulse","dir","delay"])
+    $:GPU_DECLARE(create=["mag","length","height","wavelength","frequency"])
+    $:GPU_DECLARE(create=["gauss_sigma_dist","gauss_sigma_time","npulse","dir","delay"])
 
     real(wp), allocatable, dimension(:) :: foc_length, aperture
-    $:DECLARE(create=["foc_length","aperture"])
+    $:GPU_DECLARE(create=["foc_length","aperture"])
 
     real(wp), allocatable, dimension(:) :: element_spacing_angle, element_polygon_ratio, rotate_angle
-    $:DECLARE(create=["element_spacing_angle","element_polygon_ratio","rotate_angle"])
+    $:GPU_DECLARE(create=["element_spacing_angle","element_polygon_ratio","rotate_angle"])
 
     real(wp), allocatable, dimension(:) :: bb_bandwidth, bb_lowest_freq
-    $:DECLARE(create=["bb_bandwidth","bb_lowest_freq"])
+    $:GPU_DECLARE(create=["bb_bandwidth","bb_lowest_freq"])
 
     integer, allocatable, dimension(:) :: num_elements, element_on, bb_num_freq
-    $:DECLARE(create=["num_elements","element_on","bb_num_freq"])
+    $:GPU_DECLARE(create=["num_elements","element_on","bb_num_freq"])
 
     !> @name Acoustic source terms
     !> @{
     real(wp), allocatable, dimension(:, :, :) :: mass_src, e_src
     real(wp), allocatable, dimension(:, :, :, :) :: mom_src
     !> @}
-    $:DECLARE(create=["mass_src","e_src","mom_src"])
+    $:GPU_DECLARE(create=["mass_src","e_src","mom_src"])
 
     integer, dimension(:), allocatable :: source_spatials_num_points !< Number of non-zero source grid points for each source
-    $:DECLARE(create=["source_spatials_num_points"])
+    $:GPU_DECLARE(create=["source_spatials_num_points"])
 
     type(source_spatial_type), dimension(:), allocatable :: source_spatials !< Data of non-zero source grid points for each source
-    $:DECLARE(create=["source_spatials"])
+    $:GPU_DECLARE(create=["source_spatials"])
 
 contains
 
@@ -111,7 +111,7 @@ contains
                 delay(i) = acoustic(i)%delay
             end if
         end do
-        $:UPDATE(device=["loc_acoustic","mag","dipole","support","length", &
+        $:GPU_UPDATE(device=["loc_acoustic","mag","dipole","support","length", &
             & "height","wavelength","frequency","gauss_sigma_dist", &
             & "gauss_sigma_time","foc_length","aperture","npulse","pulse", &
             & "dir","delay","element_polygon_ratio","rotate_angle", &
@@ -167,7 +167,7 @@ contains
 
         sim_time = t_step*dt
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -220,7 +220,7 @@ contains
 
             deallocate (phi_rn)
 
-            $:PARALLEL_LOOP(private=["myalpha","myalpha_rho"])
+            $:GPU_PARALLEL_LOOP(private=["myalpha","myalpha_rho"])
             do i = 1, num_points
                 j = source_spatials(ai)%coord(1, i)
                 k = source_spatials(ai)%coord(2, i)
@@ -239,7 +239,7 @@ contains
 
                 if (bubbles_euler) then
                     if (num_fluids > 2) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do q = 1, num_fluids - 1
                             myRho = myRho + myalpha_rho(q)
                             B_tait = B_tait + myalpha(q)*pi_infs(q)
@@ -253,7 +253,7 @@ contains
                 end if
 
                 if ((.not. bubbles_euler) .or. (mpp_lim .and. (num_fluids > 2))) then
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do q = 1, num_fluids
                         myRho = myRho + myalpha_rho(q)
                         B_tait = B_tait + myalpha(q)*pi_infs(q)
@@ -320,15 +320,15 @@ contains
         end do
 
         ! Update the rhs variables
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do q = contxb, contxe
                         rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mass_src(j, k, l)
                     end do
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do q = momxb, momxe
                         rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mom_src(q - contxe, j, k, l)
                     end do
@@ -347,7 +347,7 @@ contains
     !! @param gauss_sigma_time_local sigma in time for Gaussian pulse
     !! @param source Source term amplitude
     pure elemental subroutine s_source_temporal(sim_time, c, ai, term_index, frequency_local, gauss_sigma_time_local, source, sum_BB)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         integer, intent(in) :: ai, term_index
         real(wp), intent(in) :: sim_time, c, sum_BB
         real(wp), intent(in) :: frequency_local, gauss_sigma_time_local
@@ -474,14 +474,14 @@ contains
                 call s_mpi_abort('Fatal Error: Inconsistent allocation of source_spatials')
             end if
 
-            $:UPDATE(device=["source_spatials(ai)%coord"])
-            $:UPDATE(device=["source_spatials(ai)%val"])
+            $:GPU_UPDATE(device=["source_spatials(ai)%coord"])
+            $:GPU_UPDATE(device=["source_spatials(ai)%val"])
             if (support(ai) >= 5) then
                 if (dim == 2) then
-                    $:UPDATE(device=["source_spatials(ai)%angle"])
+                    $:GPU_UPDATE(device=["source_spatials(ai)%angle"])
                 end if
                 if (dim == 3) then
-                    $:UPDATE(device=["source_spatials(ai)%xyz_to_r_ratios"])
+                    $:GPU_UPDATE(device=["source_spatials(ai)%xyz_to_r_ratios"])
                 end if
             end if
 
@@ -699,7 +699,7 @@ contains
     !! @param c Speed of sound
     !! @return frequency_local Converted frequency
     pure elemental function f_frequency_local(freq_conv_flag, ai, c)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         logical, intent(in) :: freq_conv_flag
         integer, intent(in) :: ai
         real(wp), intent(in) :: c
@@ -718,7 +718,7 @@ contains
     !! @param ai Acoustic source index
     !! @return gauss_sigma_time_local Converted Gaussian sigma time
     pure elemental function f_gauss_sigma_time_local(gauss_conv_flag, ai, c)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         logical, intent(in) :: gauss_conv_flag
         integer, intent(in) :: ai
         real(wp), intent(in) :: c
diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index b0d7cda444..d761141a0d 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -23,7 +23,7 @@ module m_body_forces
               s_finalize_body_forces_module
 
     real(wp), allocatable, dimension(:, :, :) :: rhoM
-    $:DECLARE(create=["rhoM"])
+    $:GPU_DECLARE(create=["rhoM"])
 
 contains
 
@@ -68,7 +68,7 @@ contains
             end if
         end if
 
-        $:UPDATE(device=["accel_bf"])
+        $:GPU_UPDATE(device=["accel_bf"])
 
     end subroutine s_compute_acceleration
 
@@ -80,7 +80,7 @@ contains
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         integer :: i, j, k, l !< standard iterators
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -110,7 +110,7 @@ contains
         call s_compute_acceleration(mytime)
         call s_compute_mixture_density(q_cons_vf)
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = momxb, E_idx
             do l = 0, p
                 do k = 0, n
@@ -123,7 +123,7 @@ contains
 
         if (bf_x) then ! x-direction body forces
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -138,7 +138,7 @@ contains
 
         if (bf_y) then ! y-direction body forces
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -153,7 +153,7 @@ contains
 
         if (bf_z) then ! z-direction body forces
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
diff --git a/src/simulation/m_boundary_conditions.fpp b/src/simulation/m_boundary_conditions.fpp
index ad50b559ac..f7408c43de 100644
--- a/src/simulation/m_boundary_conditions.fpp
+++ b/src/simulation/m_boundary_conditions.fpp
@@ -40,7 +40,7 @@ contains
         do dir = 1, num_dims
             do loc = -1, 1, 2
                 read (1) bc_type(dir, loc)%sf
-                $:UPDATE(device=["bc_type(dir, loc)%sf"])
+                $:GPU_UPDATE(device=["bc_type(dir, loc)%sf"])
             end do
         end do
         close (1)
@@ -56,7 +56,7 @@ contains
         do dir = 1, num_dims
             do loc = -1, 1, 2
                 read (1) bc_buffers(dir, loc)%sf
-                $:UPDATE(device=["bc_buffers(dir, loc)%sf"])
+                $:GPU_UPDATE(device=["bc_buffers(dir, loc)%sf"])
             end do
         end do
         close (1)
@@ -104,7 +104,7 @@ contains
                 call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), MPI_INTEGER, MPI_BC_TYPE_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr)
                 call MPI_File_read_all(file_id, bc_type(dir, loc)%sf, 1, MPI_BC_TYPE_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr)
                 offset = offset + sizeof(bc_type(dir, loc)%sf)
-                $:UPDATE(device=["bc_type(dir, loc)%sf"])
+                $:GPU_UPDATE(device=["bc_type(dir, loc)%sf"])
             end do
         end do
 
@@ -114,7 +114,7 @@ contains
                 call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), mpi_p, MPI_BC_BUFFER_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr)
                 call MPI_File_read_all(file_id, bc_buffers(dir, loc)%sf, 1, MPI_BC_BUFFER_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr)
                 offset = offset + sizeof(bc_buffers(dir, loc)%sf)
-                $:UPDATE(device=["bc_buffers(dir, loc)%sf"])
+                $:GPU_UPDATE(device=["bc_buffers(dir, loc)%sf"])
             end do
         end do
 
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index 54fc7fcc47..3d9341ef0a 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -21,7 +21,7 @@ module m_bubbles
     real(wp) :: chi_vw  !< Bubble wall properties (Ando 2010)
     real(wp) :: k_mw    !< Bubble wall properties (Ando 2010)
     real(wp) :: rho_mw  !< Bubble wall properties (Ando 2010)
-    $:DECLARE(create=["chi_vw","k_mw","rho_mw"])
+    $:GPU_DECLARE(create=["chi_vw","k_mw","rho_mw"])
 
 contains
 
@@ -40,7 +40,7 @@ contains
         !!  @param f_divu Divergence of velocity
         !!  @param fCson Speed of sound from fP (EL)
     pure elemental function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu
         real(wp), intent(in) :: fCson
@@ -70,7 +70,7 @@ contains
         else if (bubble_model == 3) then
             ! Rayleigh-Plesset bubbles
             fCpbw = f_cpbw_KM(fR0, fR, fV, fpb)
-            f_rddot = f_rddot_RP(fP, fRho, fR, fV, fCpbw)
+            f_rddot = f_rddot_RP(fP, fRho, fR, fV, fR0, fCpbw)
         end if
 
     end function f_rddot
@@ -81,7 +81,7 @@ contains
         !!  @param fV Current bubble velocity
         !!  @param fpb Internal bubble pressure
     pure elemental function f_cpbw(fR0, fR, fV, fpb)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fR0, fR, fV, fpb
 
         real(wp) :: f_cpbw
@@ -100,7 +100,7 @@ contains
         !!  @param fntait Tait EOS parameter
         !!  @param fBtait Tait EOS parameter
     pure elemental function f_H(fCpbw, fCpinf, fntait, fBtait)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fCpbw, fCpinf, fntait, fBtait
 
         real(wp) :: tmp1, tmp2, tmp3
@@ -120,7 +120,7 @@ contains
         !! @param fBtait Tait EOS parameter
         !! @param fH Bubble enthalpy
     pure elemental function f_cgas(fCpinf, fntait, fBtait, fH)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fCpinf, fntait, fBtait, fH
 
         real(wp) :: tmp
@@ -143,7 +143,7 @@ contains
         !!  @param advsrc Advection equation source term
         !!  @param divu Divergence of velocity
     pure elemental function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fRho, fP, falf, fntait, fBtait, advsrc, divu
 
         real(wp) :: c2_liquid
@@ -173,7 +173,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fpbdot Time derivative of the internal bubble pressure
     pure elemental function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fCpbw, fCpinf, fCpinf_dot, fntait, fBtait
         real(wp), intent(in) :: fR, fV, fR0, fpbdot
 
@@ -209,7 +209,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fCpbw Boundary wall pressure
     pure elemental function f_rddot_RP(fCp, fRho, fR, fV, fR0, fCpbw)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fCp, fRho, fR, fV, fR0, fCpbw
 
         real(wp) :: f_rddot_RP
@@ -232,7 +232,7 @@ contains
         !!  @param fntait Tait EOS parameter
         !!  @param fBtait Tait EOS parameter
     pure elemental function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fCpbw, fR, fV, fH, fHdot
         real(wp), intent(in) :: fcgas, fntait, fBtait
 
@@ -255,7 +255,7 @@ contains
         !!  @param fV Current bubble velocity
         !!  @param fpb Internal bubble pressure
     pure elemental function f_cpbw_KM(fR0, fR, fV, fpb)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fR0, fR, fV, fpb
         real(wp) :: f_cpbw_KM
 
@@ -282,7 +282,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fC Current sound speed
     pure elemental function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fpbdot, fCp, fCpbw
         real(wp), intent(in) :: fRho, fR, fV, fR0, fC
 
@@ -316,7 +316,7 @@ contains
         !!  @param pb Internal bubble pressure
         !!  @param iR0 Current bubble size index
     pure elemental subroutine s_bwproperty(pb, iR0, chi_vw, k_mw, rho_mw)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: pb
         integer, intent(in) :: iR0
         real(wp), intent(out) :: chi_vw  !< Bubble wall properties (Ando 2010)
@@ -346,7 +346,7 @@ contains
         !!  @param fR_m Mixture gas constant (EL)
         !!  @param fgamma_m Mixture gamma (EL)
     pure elemental subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_n, fbeta_c, fR_m, fgamma_m)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fR
         real(wp), intent(in) :: fV
         real(wp), intent(in) :: fpb
@@ -404,7 +404,7 @@ contains
         !!  @param fR_m Mixture gas constant (EL)
         !!  @param fgamma_m Mixture gamma (EL)
     pure elemental function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in) :: fvflux
         real(wp), intent(in) :: fR
         real(wp), intent(in) :: fV
@@ -467,7 +467,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_advance_step
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), intent(inout) :: fR, fV, fpb, fmass_v
         real(wp), intent(in) :: fRho, fP, fR0, fpbdot, alf
@@ -601,7 +601,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_initial_substep_h
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu
@@ -687,7 +687,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_advance_substep
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), intent(OUT) :: err
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
@@ -785,7 +785,7 @@ contains
         !!  @param fdMvdt_tmp Rate of change of the mass of vapor in the bubble
     pure elemental subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, &
                                            fmass_n, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(IN) :: fR_tmp, fV_tmp, fPb_tmp, fMv_tmp
         real(wp), intent(IN) :: fmass_n, fbeta_c, fbeta_t
         integer, intent(IN) :: bub_id
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index d6c2b0212c..c522af5e27 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -22,13 +22,13 @@ module m_bubbles_EE
 
     real(wp), allocatable, dimension(:, :, :) :: bub_adv_src
     real(wp), allocatable, dimension(:, :, :, :) :: bub_r_src, bub_v_src, bub_p_src, bub_m_src
-    $:DECLARE(create=["bub_adv_src","bub_r_src","bub_v_src","bub_p_src","bub_m_src"])
+    $:GPU_DECLARE(create=["bub_adv_src","bub_r_src","bub_v_src","bub_p_src","bub_m_src"])
 
     type(scalar_field) :: divu !< matrix for div(u)
-    $:DECLARE(create=["divu"])
+    $:GPU_DECLARE(create=["divu"])
 
     integer, allocatable, dimension(:) :: rs, vs, ms, ps
-    $:DECLARE(create=["rs","vs","ms","ps"])
+    $:GPU_DECLARE(create=["rs","vs","ms","ps"])
 
 contains
 
@@ -52,9 +52,9 @@ contains
             end if
         end do
 
-        $:UPDATE(device=["rs", "vs"])
+        $:GPU_UPDATE(device=["rs", "vs"])
         if (.not. polytropic) then
-            $:UPDATE(device=["ps", "ms"])
+            $:GPU_UPDATE(device=["ps", "ms"])
         end if
 
         @:ALLOCATE(divu%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
@@ -77,12 +77,12 @@ contains
         real(wp) :: nR3bar
         integer(wp) :: i, j, k, l
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
                     nR3bar = 0._wp
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, nb
                         nR3bar = nR3bar + weight(i)*(q_cons_vf(rs(i))%sf(j, k, l))**3._wp
                     end do
@@ -104,7 +104,7 @@ contains
         if (idir == 1) then
 
             if (.not. qbmm) then
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -120,7 +120,7 @@ contains
 
         elseif (idir == 2) then
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -134,7 +134,7 @@ contains
 
         elseif (idir == 3) then
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -173,13 +173,13 @@ contains
         integer :: dmBub_id !< Dummy variables for unified subgrid bubble subroutines
         real(wp) :: dmMass_v, dmMass_n, dmBeta_c, dmBeta_t, dmCson
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
                     bub_adv_src(j, k, l) = 0._wp
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do q = 1, nb
                         bub_r_src(j, k, l, q) = 0._wp
                         bub_v_src(j, k, l, q) = 0._wp
@@ -191,7 +191,7 @@ contains
         end do
 
         adap_dt_stop_max = 0
-        $:PARALLEL_LOOP(collapse=3, private=["Rtmp", "Vtmp", "myalpha_rho", "myalpha"], &
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["Rtmp", "Vtmp", "myalpha_rho", "myalpha"], &
             reduction=["adap_dt_stop_max"], reductionOp="MAX", &
             copy=["adap_dt_stop_max"])
         do l = 0, p
@@ -201,7 +201,7 @@ contains
                     if (adv_n) then
                         nbub = q_prim_vf(n_idx)%sf(j, k, l)
                     else
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do q = 1, nb
                             Rtmp(q) = q_prim_vf(rs(q))%sf(j, k, l)
                             Vtmp(q) = q_prim_vf(vs(q))%sf(j, k, l)
@@ -209,7 +209,7 @@ contains
 
                         R3 = 0._wp
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do q = 1, nb
                             R3 = R3 + weight(q)*Rtmp(q)**3._wp
                         end do
@@ -220,7 +220,7 @@ contains
                     if (.not. adap_dt) then
                         R2Vav = 0._wp
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do q = 1, nb
                             R2Vav = R2Vav + weight(q)*Rtmp(q)**2._wp*Vtmp(q)
                         end do
@@ -228,10 +228,10 @@ contains
                         bub_adv_src(j, k, l) = 4._wp*pi*nbub*R2Vav
                     end if
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do q = 1, nb
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do ii = 1, num_fluids
                             myalpha_rho(ii) = q_cons_vf(ii)%sf(j, k, l)
                             myalpha(ii) = q_cons_vf(advxb + ii - 1)%sf(j, k, l)
@@ -242,14 +242,14 @@ contains
                         B_tait = 0._wp
 
                         if (mpp_lim .and. (num_fluids > 2)) then
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do ii = 1, num_fluids
                                 myRho = myRho + myalpha_rho(ii)
                                 n_tait = n_tait + myalpha(ii)*gammas(ii)
                                 B_tait = B_tait + myalpha(ii)*pi_infs(ii)
                             end do
                         else if (num_fluids > 2) then
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do ii = 1, num_fluids - 1
                                 myRho = myRho + myalpha_rho(ii)
                                 n_tait = n_tait + myalpha(ii)*gammas(ii)
@@ -325,14 +325,14 @@ contains
         if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.")
 
         if (.not. adap_dt) then
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do q = 0, n
                     do i = 0, m
                         rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + bub_adv_src(i, q, l)
                         if (num_fluids > 1) rhs_vf(advxb)%sf(i, q, l) = &
                             rhs_vf(advxb)%sf(i, q, l) - bub_adv_src(i, q, l)
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do k = 1, nb
                             rhs_vf(rs(k))%sf(i, q, l) = rhs_vf(rs(k))%sf(i, q, l) + bub_r_src(i, q, l, k)
                             rhs_vf(vs(k))%sf(i, q, l) = rhs_vf(vs(k))%sf(i, q, l) + bub_v_src(i, q, l, k)
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index c455426180..3240a9d9a8 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -33,26 +33,26 @@ module m_bubbles_EL
     real(wp), allocatable, dimension(:) :: bub_R0            !< Initial bubble radius
     real(wp), allocatable, dimension(:) :: Rmax_stats        !< Maximum radius
     real(wp), allocatable, dimension(:) :: Rmin_stats        !< Minimum radius
-    $:DECLARE(create=["lag_id", "bub_R0", "Rmax_stats", "Rmin_stats"])
+    $:GPU_DECLARE(create=["lag_id", "bub_R0", "Rmax_stats", "Rmin_stats"])
 
     real(wp), allocatable, dimension(:) :: gas_mg            !< Bubble's gas mass
     real(wp), allocatable, dimension(:) :: gas_betaT         !< heatflux model (Preston et al., 2007)
     real(wp), allocatable, dimension(:) :: gas_betaC         !< massflux model (Preston et al., 2007)
     real(wp), allocatable, dimension(:) :: bub_dphidt        !< subgrid velocity potential (Maeda & Colonius, 2018)
-    $:DECLARE(create=["gas_mg", "gas_betaT", "gas_betaC", "bub_dphidt"])
+    $:GPU_DECLARE(create=["gas_mg", "gas_betaT", "gas_betaC", "bub_dphidt"])
 
     !(nBub, 1 -> actual val or 2 -> temp val)
     real(wp), allocatable, dimension(:, :) :: gas_p          !< Pressure in the bubble
     real(wp), allocatable, dimension(:, :) :: gas_mv         !< Vapor mass in the bubble
     real(wp), allocatable, dimension(:, :) :: intfc_rad      !< Bubble radius
     real(wp), allocatable, dimension(:, :) :: intfc_vel      !< Velocity of the bubble interface
-    $:DECLARE(create=["gas_p", "gas_mv", "intfc_rad", "intfc_vel"])
+    $:GPU_DECLARE(create=["gas_p", "gas_mv", "intfc_rad", "intfc_vel"])
     !(nBub, 1-> x or 2->y or 3 ->z, 1 -> actual or 2 -> temporal val)
     real(wp), allocatable, dimension(:, :, :) :: mtn_pos     !< Bubble's position
     real(wp), allocatable, dimension(:, :, :) :: mtn_posPrev !< Bubble's previous position
     real(wp), allocatable, dimension(:, :, :) :: mtn_vel     !< Bubble's velocity
     real(wp), allocatable, dimension(:, :, :) :: mtn_s       !< Bubble's computational cell position in real format
-    $:DECLARE(create=["mtn_pos", "mtn_posPrev", "mtn_vel", "mtn_s"])
+    $:GPU_DECLARE(create=["mtn_pos", "mtn_posPrev", "mtn_vel", "mtn_s"])
     !(nBub, 1-> x or 2->y or 3 ->z, time-stage)
     real(wp), allocatable, dimension(:, :) :: intfc_draddt   !< Time derivative of bubble's radius
     real(wp), allocatable, dimension(:, :) :: intfc_dveldt   !< Time derivative of bubble's interface velocity
@@ -60,18 +60,18 @@ module m_bubbles_EL
     real(wp), allocatable, dimension(:, :) :: gas_dmvdt      !< Time derivative of the vapor mass in the bubble
     real(wp), allocatable, dimension(:, :, :) :: mtn_dposdt  !< Time derivative of the bubble's position
     real(wp), allocatable, dimension(:, :, :) :: mtn_dveldt  !< Time derivative of the bubble's velocity
-    $:DECLARE(create=["intfc_draddt", "intfc_dveldt", "gas_dpdt", "gas_dmvdt", "mtn_dposdt", "mtn_dveldt"])
+    $:GPU_DECLARE(create=["intfc_draddt", "intfc_dveldt", "gas_dpdt", "gas_dmvdt", "mtn_dposdt", "mtn_dveldt"])
 
     integer, private :: lag_num_ts                                  !<  Number of time stages in the time-stepping scheme
 
-    $:DECLARE(create=["lag_num_ts"])
+    $:GPU_DECLARE(create=["lag_num_ts"])
 
     integer :: nBubs                            !< Number of bubbles in the local domain
     real(wp) :: Rmax_glb, Rmin_glb       !< Maximum and minimum bubbe size in the local domain
     type(vector_field) :: q_beta                !< Projection of the lagrangian particles in the Eulerian framework
     integer :: q_beta_idx                       !< Size of the q_beta vector field
 
-    $:DECLARE(create=["nBubs","Rmax_glb","Rmin_glb","q_beta","q_beta_idx"])
+    $:GPU_DECLARE(create=["nBubs","Rmax_glb","Rmin_glb","q_beta","q_beta_idx"])
 
 contains
 
@@ -101,7 +101,7 @@ contains
             call s_mpi_abort('Please check the lag_params%solver_approach input')
         end if
 
-        $:UPDATE(device=["lag_num_ts", "q_beta_idx"])
+        $:GPU_UPDATE(device=["lag_num_ts", "q_beta_idx"])
 
         @:ALLOCATE(q_beta%vf(1:q_beta_idx))
 
@@ -249,9 +249,9 @@ contains
 
         print *, " Lagrange bubbles running, in proc", proc_rank, "number:", bub_id, "/", id
 
-        $:UPDATE(device=["bubbles_lagrange", "lag_params"])
+        $:GPU_UPDATE(device=["bubbles_lagrange", "lag_params"])
 
-        $:UPDATE(device=["lag_id","bub_R0","Rmax_stats","Rmin_stats","gas_mg", &
+        $:GPU_UPDATE(device=["lag_id","bub_R0","Rmax_stats","Rmin_stats","gas_mg", &
             & "gas_betaT","gas_betaC","bub_dphidt","gas_p","gas_mv", &
             & "intfc_rad","intfc_vel","mtn_pos","mtn_posPrev","mtn_vel", &
             & "mtn_s","intfc_draddt","intfc_dveldt","gas_dpdt","gas_dmvdt", &
@@ -259,9 +259,9 @@ contains
 
         Rmax_glb = min(dflt_real, -dflt_real)
         Rmin_glb = max(dflt_real, -dflt_real)
-        $:UPDATE(device=["Rmax_glb", "Rmin_glb"])
+        $:GPU_UPDATE(device=["Rmax_glb", "Rmin_glb"])
 
-        $:UPDATE(device=["dx","dy","dz","x_cb","x_cc","y_cb","y_cc","z_cb","z_cc"])
+        $:GPU_UPDATE(device=["dx","dy","dz","x_cb","x_cc","y_cb","y_cc","z_cb","z_cc"])
 
         !Populate temporal variables
         call s_transfer_data_to_tmp()
@@ -528,7 +528,7 @@ contains
         ! Subgrid p_inf model based on Maeda and Colonius (2018).
         if (lag_params%pressure_corrector) then
             ! Calculate velocity potentials (valid for one bubble per cell)
-            $:PARALLEL_LOOP(private=["k","cell"])
+            $:GPU_PARALLEL_LOOP(private=["k","cell"])
             do k = 1, nBubs
                 call s_get_pinf(k, q_prim_vf, 2, paux, cell, preterm1, term2, Romega)
                 myR0 = bub_R0(k)
@@ -549,7 +549,7 @@ contains
 
         ! Radial motion model
         adap_dt_stop_max = 0
-        $:PARALLEL_LOOP(private=["k","myalpha_rho","myalpha","Re","cell"], &
+        $:GPU_PARALLEL_LOOP(private=["k","myalpha_rho","myalpha","Re","cell"], &
             & reduction=["adap_dt_stop_max"],reductionOp="MAX", &
             & copy=["adap_dt_stop_max"],copyin=["stage"])
         do k = 1, nBubs
@@ -574,7 +574,7 @@ contains
             call s_get_pinf(k, q_prim_vf, 1, myPinf, cell, aux1, aux2)
 
             ! Obtain liquid density and computing speed of sound from pinf
-            $:LOOP()
+            $:GPU_LOOP()
             do i = 1, num_fluids
                 myalpha_rho(i) = q_prim_vf(i)%sf(cell(1), cell(2), cell(3))
                 myalpha(i) = q_prim_vf(E_idx + i)%sf(cell(1), cell(2), cell(3))
@@ -619,7 +619,7 @@ contains
         if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.")
 
         ! Bubbles remain in a fixed position
-        $:PARALLEL_LOOP(collapse=2, private=["k"], copyin=["stage"])
+        $:GPU_PARALLEL_LOOP(collapse=2, private=["k"], copyin=["stage"])
         do k = 1, nBubs
             do l = 1, 3
                 mtn_dposdt(k, l, stage) = 0._wp
@@ -649,7 +649,7 @@ contains
         if (lag_params%solver_approach == 2) then
 
             if (p == 0) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -665,7 +665,7 @@ contains
                     end do
                 end do
             else
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -685,7 +685,7 @@ contains
 
                 call s_gradient_dir(q_prim_vf(E_idx), q_beta%vf(3), l)
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -700,7 +700,7 @@ contains
                 end do
 
                 !source in energy
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do k = idwbuff(3)%beg, idwbuff(3)%end
                     do j = idwbuff(2)%beg, idwbuff(2)%end
                         do i = idwbuff(1)%beg, idwbuff(1)%end
@@ -711,7 +711,7 @@ contains
 
                 call s_gradient_dir(q_beta%vf(3), q_beta%vf(4), l)
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -742,7 +742,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_cson_from_pinf
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         real(wp), intent(in) :: pinf, rhol, gamma, pi_inf
@@ -753,7 +753,7 @@ contains
         real(wp), dimension(num_dims) :: vel
         integer :: i
 
-        $:LOOP()
+        $:GPU_LOOP()
         do i = 1, num_dims
             vel(i) = q_prim_vf(i + contxe)%sf(cell(1), cell(2), cell(3))
         end do
@@ -770,7 +770,7 @@ contains
 
         call nvtxStartRange("BUBBLES-LAGRANGE-KERNELS")
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, q_beta_idx
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
@@ -785,7 +785,7 @@ contains
                               mtn_s, mtn_pos, q_beta)
 
         !Store 1-beta
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = idwbuff(3)%beg, idwbuff(3)%end
             do k = idwbuff(2)%beg, idwbuff(2)%end
                 do j = idwbuff(1)%beg, idwbuff(1)%end
@@ -812,7 +812,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_pinf
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         integer, intent(in) :: bub_id, ptype
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
@@ -834,7 +834,7 @@ contains
 
         !< Find current bubble cell
         cell(:) = int(scoord(:))
-        $:LOOP()
+        $:GPU_LOOP()
         do i = 1, num_dims
             if (scoord(i) < 0._wp) cell(i) = cell(i) - 1
         end do
@@ -925,11 +925,11 @@ contains
             charpres2 = 0._wp
             vol = 0._wp
 
-            $:LOOP()
+            $:GPU_LOOP()
             do i = 1, smearGrid
-                $:LOOP()
+                $:GPU_LOOP()
                 do j = 1, smearGrid
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do k = 1, smearGridz
                         cellaux(1) = cell(1) + i - (mapCells + 1)
                         cellaux(2) = cell(2) + j - (mapCells + 1)
@@ -1028,7 +1028,7 @@ contains
         integer :: k
 
         if (time_stepper == 1) then ! 1st order TVD RK
-            $:PARALLEL_LOOP(private=["k"])
+            $:GPU_PARALLEL_LOOP(private=["k"])
             do k = 1, nBubs
                 !u{1} = u{n} +  dt * RHS{n}
                 intfc_rad(k, 1) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1)
@@ -1044,13 +1044,13 @@ contains
             if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
 
             if (lag_params%write_bubbles) then
-                $:UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
+                $:GPU_UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
                 call s_write_lag_particles(mytime)
             end if
 
         elseif (time_stepper == 2) then ! 2nd order TVD RK
             if (stage == 1) then
-                $:PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{1} = u{n} +  dt * RHS{n}
                     intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1)
@@ -1062,7 +1062,7 @@ contains
                 end do
 
             elseif (stage == 2) then
-                $:PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1})
                     intfc_rad(k, 1) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/2._wp
@@ -1078,7 +1078,7 @@ contains
                 if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
 
                 if (lag_params%write_bubbles) then
-                    $:UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
+                    $:GPU_UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
                     call s_write_lag_particles(mytime)
                 end if
 
@@ -1086,7 +1086,7 @@ contains
 
         elseif (time_stepper == 3) then ! 3rd order TVD RK
             if (stage == 1) then
-                $:PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{1} = u{n} +  dt * RHS{n}
                     intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1)
@@ -1098,7 +1098,7 @@ contains
                 end do
 
             elseif (stage == 2) then
-                $:PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}]
                     intfc_rad(k, 2) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/4._wp
@@ -1109,7 +1109,7 @@ contains
                     gas_mv(k, 2) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/4._wp
                 end do
             elseif (stage == 3) then
-                $:PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private=["k"])
                 do k = 1, nBubs
                     !u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}]
                     intfc_rad(k, 1) = intfc_rad(k, 1) + (2._wp/3._wp)*dt*(intfc_draddt(k, 1)/4._wp + intfc_draddt(k, 2)/4._wp + intfc_draddt(k, 3))
@@ -1125,7 +1125,7 @@ contains
                 if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
 
                 if (lag_params%write_bubbles) then
-                    $:UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
+                    $:GPU_UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
                     call s_write_lag_particles(mytime)
                 end if
 
@@ -1195,7 +1195,7 @@ contains
 
         integer :: k
 
-        $:PARALLEL_LOOP(private=["k"])
+        $:GPU_PARALLEL_LOOP(private=["k"])
         do k = 1, nBubs
             gas_p(k, 2) = gas_p(k, 1)
             gas_mv(k, 2) = gas_mv(k, 1)
@@ -1294,7 +1294,7 @@ contains
 
         if (dir == 1) then
             ! Gradient in x dir.
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do k = 0, p
                 do j = 0, n
                     do i = 0, m
@@ -1309,7 +1309,7 @@ contains
         else
             if (dir == 2) then
                 ! Gradient in y dir.
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -1323,7 +1323,7 @@ contains
                 end do
             else
                 ! Gradient in z dir.
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do k = 0, p
                     do j = 0, n
                         do i = 0, m
@@ -1417,7 +1417,7 @@ contains
         lag_void_max = 0._wp
         lag_void_avg = 0._wp
         lag_vol = 0._wp
-        $:PARALLEL_LOOP(collapse=3, reduction=[["lag_vol", "lag_void_avg"], &
+        $:GPU_PARALLEL_LOOP(collapse=3, reduction=[["lag_vol", "lag_void_avg"], &
             ["lag_void_max"]], reductionOp=["+", "MAX"], &
             copy=["lag_vol", "lag_void_avg", "lag_void_max"])
         do k = 0, p
@@ -1603,7 +1603,7 @@ contains
 
         integer :: k
 
-        $:PARALLEL_LOOP(reduction=[["Rmax_glb"], ["Rmin_glb"]], &
+        $:GPU_PARALLEL_LOOP(reduction=[["Rmax_glb"], ["Rmin_glb"]], &
             & reductionOp=["MAX", "MIN"], copy=["Rmax_glb","Rmin_glb"])
         do k = 1, nBubs
             Rmax_glb = max(Rmax_glb, intfc_rad(k, 1)/bub_R0(k))
@@ -1623,7 +1623,7 @@ contains
         write (file_loc, '(A,I0,A)') 'stats_lag_bubbles_', proc_rank, '.dat'
         file_loc = trim(case_dir)//'/D/'//trim(file_loc)
 
-        $:UPDATE(host=["Rmax_glb","Rmin_glb"])
+        $:GPU_UPDATE(host=["Rmax_glb","Rmin_glb"])
 
         open (13, FILE=trim(file_loc), FORM='formatted', position='rewind')
         write (13, *) 'proc_rank, particleID, x, y, z, Rmax_glb, Rmin_glb'
@@ -1651,7 +1651,7 @@ contains
 
         integer :: i
 
-        $:LOOP()
+        $:GPU_LOOP()
         do i = bub_id, nBubs - 1
             lag_id(i, 1) = lag_id(i + 1, 1)
             bub_R0(i) = bub_R0(i + 1)
@@ -1676,7 +1676,7 @@ contains
         end do
 
         nBubs = nBubs - 1
-        $:UPDATE(device=["nBubs"])
+        $:GPU_UPDATE(device=["nBubs"])
 
     end subroutine s_remove_lag_bubble
 
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index ae00f42a25..aa660b2ed0 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -56,7 +56,7 @@ contains
         real(wp), dimension(3) :: s_coord
         integer :: l
 
-        $:PARALLEL_LOOP(private=["l","s_coord","cell"])
+        $:GPU_PARALLEL_LOOP(private=["l","s_coord","cell"])
         do l = 1, nBubs
 
             volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp
@@ -121,7 +121,7 @@ contains
         smearGridz = smearGrid
         if (p == 0) smearGridz = 1
 
-        $:PARALLEL_LOOP(private=["nodecoord","l","s_coord","cell","center"], copyin=["smearGrid","smearGridz"])
+        $:GPU_PARALLEL_LOOP(private=["nodecoord","l","s_coord","cell","center"], copyin=["smearGrid","smearGridz"])
         do l = 1, nBubs
             nodecoord(1:3) = 0
             center(1:3) = 0._wp
@@ -204,7 +204,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_applygaussian
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: center
         integer, dimension(3), intent(in) :: cellaux
@@ -274,7 +274,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_check_celloutside
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         integer, dimension(3), intent(inout) :: cellaux
         logical, intent(out) :: celloutside
@@ -310,7 +310,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_shift_cell_symmetric_bc
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         integer, dimension(3), intent(inout) :: cellaux
         integer, dimension(3), intent(in) :: cell
@@ -351,7 +351,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_stddsv
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         integer, dimension(3), intent(in) :: cell
         real(wp), intent(in) :: volpart
@@ -392,7 +392,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_char_vol
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         integer, intent(in) :: cellx, celly, cellz
         real(wp), intent(out) :: Charvol
@@ -417,7 +417,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_cell
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: s_cell
         integer, dimension(3), intent(out) :: get_cell
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 6b19251034..3c587d9735 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -49,7 +49,7 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsz_vf
-    $:DECLARE(create=["q_prim_rsx_vf","q_prim_rsy_vf","q_prim_rsz_vf"])
+    $:GPU_DECLARE(create=["q_prim_rsx_vf","q_prim_rsy_vf","q_prim_rsz_vf"])
 
     type(scalar_field), allocatable, dimension(:) :: F_rs_vf, F_src_rs_vf !<
 
@@ -59,7 +59,7 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: F_rsx_vf, F_src_rsx_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: F_rsy_vf, F_src_rsy_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: F_rsz_vf, F_src_rsz_vf !<
-    $:DECLARE(create=["F_rsx_vf","F_src_rsx_vf","F_rsy_vf","F_src_rsy_vf","F_rsz_vf","F_src_rsz_vf"])
+    $:GPU_DECLARE(create=["F_rsx_vf","F_src_rsx_vf","F_rsy_vf","F_src_rsy_vf","F_rsz_vf","F_src_rsz_vf"])
 
     !! There is a CCE bug that is causing some subset of these variables to interfere
     !! with variables of the same name in m_riemann_solvers.fpp, and giving this versions
@@ -70,14 +70,14 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf_l, flux_src_rsx_vf_l !<
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf_l, flux_src_rsy_vf_l
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf_l, flux_src_rsz_vf_l
-    $:DECLARE(create=["flux_rsx_vf_l","flux_src_rsx_vf_l","flux_rsy_vf_l","flux_src_rsy_vf_l","flux_rsz_vf_l","flux_src_rsz_vf_l"])
+    $:GPU_DECLARE(create=["flux_rsx_vf_l","flux_src_rsx_vf_l","flux_rsy_vf_l","flux_src_rsy_vf_l","flux_rsz_vf_l","flux_src_rsz_vf_l"])
 
     real(wp) :: c           !< Cell averaged speed of sound
     real(wp), dimension(2) :: Re          !< Cell averaged Reynolds numbers
-    $:DECLARE(create=["c","Re"])
+    $:GPU_DECLARE(create=["c","Re"])
 
     real(wp) :: dpres_ds !< Spatial derivatives in s-dir of pressure
-    $:DECLARE(create=["dpres_ds"])
+    $:GPU_DECLARE(create=["dpres_ds"])
 
     real(wp), allocatable, dimension(:) :: ds !< Cell-width distribution in the s-direction
 
@@ -97,21 +97,21 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :) :: pi_coef_y !< Polynomial interpolant coefficients in y-dir
     real(wp), allocatable, dimension(:, :, :) :: pi_coef_z !< Polynomial interpolant coefficients in z-dir
 
-    $:DECLARE(create=["ds","fd_coef_x","fd_coef_y","fd_coef_z","pi_coef_x","pi_coef_y","pi_coef_z"])
+    $:GPU_DECLARE(create=["ds","fd_coef_x","fd_coef_y","fd_coef_z","pi_coef_x","pi_coef_y","pi_coef_z"])
 
     !! The first dimension of the array identifies the polynomial, the
     !! second dimension identifies the position of its coefficients and the last
     !! dimension denotes the location of the CBC.
 
     type(int_bounds_info) :: is1, is2, is3 !< Indical bounds in the s1-, s2- and s3-directions
-    $:DECLARE(create=["is1","is2","is3"])
+    $:GPU_DECLARE(create=["is1","is2","is3"])
 
     integer :: dj
     integer :: bcxb, bcxe, bcyb, bcye, bczb, bcze
     integer :: cbc_dir, cbc_loc
     integer :: flux_cbc_index
-    $:DECLARE(create=["dj","bcxb","bcxe","bcyb","bcye","bczb","bcze"])
-    $:DECLARE(create=["cbc_dir", "cbc_loc","flux_cbc_index"])
+    $:GPU_DECLARE(create=["dj","bcxb","bcxe","bcyb","bcye","bczb","bcze"])
+    $:GPU_DECLARE(create=["cbc_dir", "cbc_loc","flux_cbc_index"])
 
     !! GRCBC inputs for subsonic inflow and outflow conditions consisting of
     !! inflow velocities, pressure, density and void fraction as well as
@@ -120,9 +120,9 @@ module m_cbc
     real(wp), allocatable, dimension(:) :: pres_in, pres_out, Del_in, Del_out
     real(wp), allocatable, dimension(:, :) :: vel_in, vel_out
     real(wp), allocatable, dimension(:, :) :: alpha_rho_in, alpha_in
-    $:DECLARE(create=["pres_in","pres_out","Del_in","Del_out"])
-    $:DECLARE(create=["vel_in","vel_out"])
-    $:DECLARE(create=["alpha_rho_in","alpha_in"])
+    $:GPU_DECLARE(create=["pres_in","pres_out","Del_in","Del_out"])
+    $:GPU_DECLARE(create=["vel_in","vel_out"])
+    $:GPU_DECLARE(create=["alpha_rho_in","alpha_in"])
 
 contains
 
@@ -139,7 +139,7 @@ contains
         else
             flux_cbc_index = adv_idx%end
         end if
-        $:UPDATE(device=["flux_cbc_index"])
+        $:GPU_UPDATE(device=["flux_cbc_index"])
 
         call s_any_cbc_boundaries(is_cbc)
 
@@ -389,7 +389,7 @@ contains
 
         end if
 
-        $:UPDATE(device=["fd_coef_x","fd_coef_y","fd_coef_z", &
+        $:GPU_UPDATE(device=["fd_coef_x","fd_coef_y","fd_coef_z", &
             & "pi_coef_x","pi_coef_y","pi_coef_z"])
 
         ! Associating the procedural pointer to the appropriate subroutine
@@ -398,20 +398,20 @@ contains
         bcxb = bc_x%beg
         bcxe = bc_x%end
 
-        $:UPDATE(device=["bcxb", "bcxe"])
+        $:GPU_UPDATE(device=["bcxb", "bcxe"])
 
         if (n > 0) then
             bcyb = bc_y%beg
             bcye = bc_y%end
 
-            $:UPDATE(device=["bcyb", "bcye"])
+            $:GPU_UPDATE(device=["bcyb", "bcye"])
         end if
 
         if (p > 0) then
             bczb = bc_z%beg
             bcze = bc_z%end
 
-            $:UPDATE(device=["bczb", "bcze"])
+            $:GPU_UPDATE(device=["bczb", "bcze"])
         end if
 
         ! Allocate GRCBC inputs
@@ -443,7 +443,7 @@ contains
                 end do
             end if
         #:endfor
-        $:UPDATE(device=["vel_in","vel_out","pres_in","pres_out", &
+        $:GPU_UPDATE(device=["vel_in","vel_out","pres_in","pres_out", &
             & "Del_in","Del_out","alpha_rho_in","alpha_in"])
 
     end subroutine s_initialize_cbc_module
@@ -608,7 +608,7 @@ contains
 
         end if
 
-        $:UPDATE(device=["ds"])
+        $:GPU_UPDATE(device=["ds"])
 
     end subroutine s_associate_cbc_coefficients_pointers
 
@@ -684,7 +684,7 @@ contains
         cbc_dir = cbc_dir_norm
         cbc_loc = cbc_loc_norm
 
-        $:UPDATE(device=["cbc_dir", "cbc_loc"])
+        $:GPU_UPDATE(device=["cbc_dir", "cbc_loc"])
 
         call s_initialize_cbc(q_prim_vf, flux_vf, flux_src_vf, &
                               ix, iy, iz)
@@ -702,7 +702,7 @@ contains
                                                                F_src_rs${XYZ}$_vf, &
                                                                is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg)
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = 1, flux_cbc_index
                         do r = is3%beg, is3%end
                             do k = is2%beg, is2%end
@@ -714,7 +714,7 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = advxb, advxe
                         do r = is3%beg, is3%end
                             do k = is2%beg, is2%end
@@ -733,7 +733,7 @@ contains
                                                                F_src_rs${XYZ}$_vf, &
                                                                is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg)
 
-                    $:PARALLEL_LOOP(collapse=4)
+                    $:GPU_PARALLEL_LOOP(collapse=4)
                     do i = 1, flux_cbc_index
                         do j = 0, 1
                             do r = is3%beg, is3%end
@@ -753,7 +753,7 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=4)
+                    $:GPU_PARALLEL_LOOP(collapse=4)
                     do i = advxb, advxe
                         do j = 0, 1
                             do r = is3%beg, is3%end
@@ -776,7 +776,7 @@ contains
                 end if
 
                 ! FD2 or FD4 of RHS at j = 0
-                $:PARALLEL_LOOP(collapse=2, private=["alpha_rho", "vel", "adv", &
+                $:GPU_PARALLEL_LOOP(collapse=2, private=["alpha_rho", "vel", "adv", &
                     "mf", "dvel_ds", "dadv_ds", "Re_cbc", "dalpha_rho_ds","dvel_dt", &
                     "dadv_dt", "dalpha_rho_dt", "L", "lambda", "Ys", "dYs_dt", &
                     "dYs_ds", "h_k", "Cp_i", "Gamma_i", "Xs"])
@@ -784,25 +784,25 @@ contains
                     do k = is2%beg, is2%end
 
                         ! Transferring the Primitive Variables
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, contxe
                             alpha_rho(i) = q_prim_rs${XYZ}$_vf(0, k, r, i)
                         end do
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_dims
                             vel(i) = q_prim_rs${XYZ}$_vf(0, k, r, contxe + i)
                         end do
 
                         vel_K_sum = 0._wp
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_dims
                             vel_K_sum = vel_K_sum + vel(i)**2._wp
                         end do
 
                         pres = q_prim_rs${XYZ}$_vf(0, k, r, E_idx)
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, advxe - E_idx
                             adv(i) = q_prim_rs${XYZ}$_vf(0, k, r, E_idx + i)
                         end do
@@ -813,13 +813,13 @@ contains
                             call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc)
                         end if
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, contxe
                             mf(i) = alpha_rho(i)/rho
                         end do
 
                         if (chemistry) then
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = chemxb, chemxe
                                 Ys(i - chemxb + 1) = q_prim_rs${XYZ}$_vf(0, k, r, i)
                             end do
@@ -852,39 +852,39 @@ contains
 
                         ! First-Order Spatial Derivatives of Primitive Variables
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, contxe
                             dalpha_rho_ds(i) = 0._wp
                         end do
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_dims
                             dvel_ds(i) = 0._wp
                         end do
 
                         dpres_ds = 0._wp
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, advxe - E_idx
                             dadv_ds(i) = 0._wp
                         end do
 
                         if (chemistry) then
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_species
                                 dYs_ds(i) = 0._wp
                             end do
                         end if
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do j = 0, buff_size
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, contxe
                                 dalpha_rho_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, i)* &
                                                    fd_coef_${XYZ}$ (j, cbc_loc) + &
                                                    dalpha_rho_ds(i)
                             end do
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_dims
                                 dvel_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, contxe + i)* &
                                              fd_coef_${XYZ}$ (j, cbc_loc) + &
@@ -894,7 +894,7 @@ contains
                             dpres_ds = q_prim_rs${XYZ}$_vf(j, k, r, E_idx)* &
                                        fd_coef_${XYZ}$ (j, cbc_loc) + &
                                        dpres_ds
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, advxe - E_idx
                                 dadv_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, E_idx + i)* &
                                              fd_coef_${XYZ}$ (j, cbc_loc) + &
@@ -902,7 +902,7 @@ contains
                             end do
 
                             if (chemistry) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_species
                                     dYs_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, chemxb - 1 + i)* &
                                                 fd_coef_${XYZ}$ (j, cbc_loc) + &
@@ -929,7 +929,7 @@ contains
                             call s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds)
                             ! Add GRCBC for Subsonic Inflow
                             if (bc_${XYZ}$%grcbc_in) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 2, momxb
                                     L(2) = c**3._wp*Ma*(alpha_rho(i - 1) - alpha_rho_in(i - 1, ${CBC_DIR}$))/Del_in(${CBC_DIR}$) - c*Ma*(pres - pres_in(${CBC_DIR}$))/Del_in(${CBC_DIR}$)
                                 end do
@@ -939,7 +939,7 @@ contains
                                         L(momxb + 2) = c*Ma*(vel(dir_idx(3)) - vel_in(${CBC_DIR}$, dir_idx(3)))/Del_in(${CBC_DIR}$)
                                     end if
                                 end if
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = E_idx, advxe - 1
                                     L(i) = c*Ma*(adv(i + 1 - E_idx) - alpha_in(i + 1 - E_idx, ${CBC_DIR}$))/Del_in(${CBC_DIR}$)
                                 end do
@@ -979,13 +979,13 @@ contains
                             dpres_dt = -5e-1_wp*(L(advxe) + L(1))
                         end if
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, contxe
                             dalpha_rho_dt(i) = &
                                 -(L(i + 1) - mf(i)*dpres_dt)/(c*c)
                         end do
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_dims
                             dvel_dt(dir_idx(i)) = dir_flg(dir_idx(i))* &
                                                   (L(1) - L(advxe))/(2._wp*rho*c) + &
@@ -994,13 +994,13 @@ contains
                         end do
 
                         vel_dv_dt_sum = 0._wp
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_dims
                             vel_dv_dt_sum = vel_dv_dt_sum + vel(i)*dvel_dt(i)
                         end do
 
                         if (chemistry) then
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_species
                                 dYs_dt(i) = -1._wp*L(chemxb + i - 1)
                             end do
@@ -1008,12 +1008,12 @@ contains
 
                         ! The treatment of void fraction source is unclear
                         if (cyl_coord .and. cbc_dir == 2 .and. cbc_loc == 1) then
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, advxe - E_idx
                                 dadv_dt(i) = -L(momxe + i) !+ adv(i) * vel(dir_idx(1))/y_cc(n)
                             end do
                         else
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, advxe - E_idx
                                 dadv_dt(i) = -L(momxe + i)
                             end do
@@ -1026,7 +1026,7 @@ contains
                             dgamma_dt = dadv_dt(1)
                             dpi_inf_dt = dadv_dt(2)
                         else
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_fluids
                                 drho_dt = drho_dt + dalpha_rho_dt(i)
                                 dgamma_dt = dgamma_dt + dadv_dt(i)*gammas(i)
@@ -1036,13 +1036,13 @@ contains
                         end if
 
                         ! flux_rs_vf_l and flux_src_rs_vf_l at j = -1/2
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, contxe
                             flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) &
                                                                + ds(0)*dalpha_rho_dt(i)
                         end do
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = momxb, momxe
                             flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) &
                                                                + ds(0)*(vel(i - contxe)*drho_dt &
@@ -1053,14 +1053,14 @@ contains
                             ! Evolution of LODI equation of energy for real gases adjusted to perfect gas, doi:10.1006/jcph.2002.6990
                             call get_species_enthalpies_rt(T, h_k)
                             sum_Enthalpies = 0._wp
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_species
                                 h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T
                                 sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights(i)*Cp/R_gas)*dYs_dt(i)
                             end do
                             flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) &
                                                                    + ds(0)*((E/rho + pres/rho)*drho_dt + rho*vel_dv_dt_sum + Cp*T*L(2)/(c*c) + sum_Enthalpies)
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_species
                                 flux_rs${XYZ}$_vf_l(-1, k, r, i - 1 + chemxb) = flux_rs${XYZ}$_vf_l(0, k, r, chemxb + i - 1) &
                                                                                 + ds(0)*(drho_dt*Ys(i) + rho*dYs_dt(i))
@@ -1076,12 +1076,12 @@ contains
                         end if
 
                         if (riemann_solver == 1) then
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf_l(-1, k, r, i) = 0._wp
                             end do
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = advxb, advxe
                                 flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = &
                                     1._wp/max(abs(vel(dir_idx(1))), sgm_eps) &
@@ -1094,13 +1094,13 @@ contains
 
                         else
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) + &
                                                                    ds(0)*dadv_dt(i - E_idx)
                             end do
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = advxb, advxe
                                 flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = flux_src_rs${XYZ}$_vf_l(0, k, r, i)
                             end do
@@ -1164,13 +1164,13 @@ contains
         end if
 
         dj = max(0, cbc_loc)
-        $:UPDATE(device=["is1","is2","is3","dj"])
-        $:UPDATE(device=["dir_idx","dir_flg"])
+        $:GPU_UPDATE(device=["is1","is2","is3","dj"])
+        $:GPU_UPDATE(device=["dir_idx","dir_flg"])
 
         ! Reshaping Inputted Data in x-direction
         if (cbc_dir == 1) then
 
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1182,7 +1182,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = 0, buff_size
@@ -1193,7 +1193,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1206,7 +1206,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1217,7 +1217,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1229,7 +1229,7 @@ contains
                     end do
                 end do
             else
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1246,7 +1246,7 @@ contains
             ! Reshaping Inputted Data in y-direction
         elseif (cbc_dir == 2) then
 
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1258,7 +1258,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = 0, buff_size
@@ -1269,7 +1269,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1282,7 +1282,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1293,7 +1293,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1305,7 +1305,7 @@ contains
                     end do
                 end do
             else
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1322,7 +1322,7 @@ contains
             ! Reshaping Inputted Data in z-direction
         else
 
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1334,7 +1334,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = 0, buff_size
@@ -1345,7 +1345,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1358,7 +1358,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1369,7 +1369,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1381,7 +1381,7 @@ contains
                     end do
                 end do
             else
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1415,12 +1415,12 @@ contains
 
         ! Determining the indicial shift based on CBC location
         dj = max(0, cbc_loc)
-        $:UPDATE(device=["dj"])
+        $:GPU_UPDATE(device=["dj"])
 
         ! Reshaping Outputted Data in x-direction
         if (cbc_dir == 1) then
 
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1432,7 +1432,7 @@ contains
                     end do
                 end do
             end do
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1443,7 +1443,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1455,7 +1455,7 @@ contains
                     end do
                 end do
             else
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1471,7 +1471,7 @@ contains
             ! Reshaping Outputted Data in y-direction
         elseif (cbc_dir == 2) then
 
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1484,7 +1484,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1495,7 +1495,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1507,7 +1507,7 @@ contains
                     end do
                 end do
             else
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
@@ -1524,7 +1524,7 @@ contains
             ! Reshaping Outputted Data in z-direction
         else
 
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, flux_cbc_index
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1537,7 +1537,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do r = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = -1, buff_size
@@ -1548,7 +1548,7 @@ contains
             end do
 
             if (riemann_solver == 1) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = advxb, advxe
                     do r = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -1560,7 +1560,7 @@ contains
                     end do
                 end do
             else
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = -1, buff_size
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index 5880361461..d397d19cf5 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -87,7 +87,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_slip_wall_L
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -105,7 +105,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_buffer_L
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -135,7 +135,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_inflow_L
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -152,7 +152,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_outflow_L
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -176,7 +176,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_force_free_subsonic_outflow_L
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -198,7 +198,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_constant_pressure_subsonic_outflow_L
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -220,7 +220,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_inflow_L
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(sys_size), intent(inout) :: L
         L(1:advxe) = 0._wp
@@ -232,7 +232,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_outflow_L
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 7407375f86..b1922c08fa 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -55,14 +55,14 @@ module m_data_output
     real(wp), allocatable, dimension(:, :, :) :: ccfl_sf  !< CCFL stability criterion
     real(wp), allocatable, dimension(:, :, :) :: Rc_sf  !< Rc stability criterion
     real(wp), public, allocatable, dimension(:, :) :: c_mass
-    $:DECLARE(create=["icfl_sf","vcfl_sf","ccfl_sf","Rc_sf"])
+    $:GPU_DECLARE(create=["icfl_sf","vcfl_sf","ccfl_sf","Rc_sf"])
 
     real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids
     real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids
     real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids
     real(wp) :: Rc_min_loc, Rc_min_glb !< Rc   stability extrema on local and global grids
-    $:DECLARE(create=["icfl_max_loc","icfl_max_glb","vcfl_max_loc","vcfl_max_glb"])
-    $:DECLARE(create=["ccfl_max_loc","ccfl_max_glb","Rc_min_loc","Rc_min_glb"])
+    $:GPU_DECLARE(create=["icfl_max_loc","icfl_max_glb","vcfl_max_loc","vcfl_max_glb"])
+    $:GPU_DECLARE(create=["ccfl_max_loc","ccfl_max_glb","Rc_min_loc","Rc_min_glb"])
 
     !> @name ICFL, VCFL, CCFL and Rc stability criteria extrema over all the time-steps
     !> @{
@@ -279,7 +279,7 @@ contains
         integer :: j, k, l
 
         ! Computing Stability Criteria at Current Time-step
-        $:PARALLEL_LOOP(collapse=3, private=["vel", "alpha", "Re"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["vel", "alpha", "Re"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -303,10 +303,10 @@ contains
         ! Determining local stability criteria extrema at current time-step
 
 #ifdef _CRAYFTN
-        $:UPDATE(host=["icfl_sf"])
+        $:GPU_UPDATE(host=["icfl_sf"])
 
         if (viscous) then
-            $:UPDATE(host=["vcfl_sf","Rc_sf"])
+            $:GPU_UPDATE(host=["vcfl_sf","Rc_sf"])
         end if
 
         icfl_max_loc = maxval(icfl_sf)
@@ -527,7 +527,7 @@ contains
         if (prim_vars_wrt .or. (n == 0 .and. p == 0)) then
             call s_convert_conservative_to_primitive_variables(q_cons_vf, q_T_sf, q_prim_vf, idwint)
             do i = 1, sys_size
-                $:UPDATE(host=["q_prim_vf(i)%sf(:,:,:)"])
+                $:GPU_UPDATE(host=["q_prim_vf(i)%sf(:,:,:)"])
             end do
             ! q_prim_vf(bubxb) stores the value of nb needed in riemann solvers, so replace with true primitive value (=1._wp)
             if (qbmm) then
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index c8170d39a1..b0614f880a 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -47,12 +47,12 @@ module m_fftw
     !! Filtered complex data in Fourier space
 
 #if defined(MFC_OpenACC)
-    $:DECLARE(create=["real_size","cmplx_size","x_size","batch_size","Nfq"])
+    $:GPU_DECLARE(create=["real_size","cmplx_size","x_size","batch_size","Nfq"])
 
     real(dp), allocatable, target :: data_real_gpu(:)
     complex(dp), allocatable, target :: data_cmplx_gpu(:)
     complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:)
-    $:DECLARE(create=["data_real_gpu","data_cmplx_gpu","data_fltr_cmplx_gpu"])
+    $:GPU_DECLARE(create=["data_real_gpu","data_cmplx_gpu","data_fltr_cmplx_gpu"])
 
 #if defined(__PGI)
     integer :: fwd_plan_gpu, bwd_plan_gpu
@@ -91,7 +91,7 @@ contains
         iembed(1) = 0
         oembed(1) = 0
         !$acc enter data copyin(real_size, cmplx_size, x_size, sys_size, batch_size, Nfq)
-        $:UPDATE(device=["real_size","cmplx_size","x_size","sys_size","batch_size"])
+        $:GPU_UPDATE(device=["real_size","cmplx_size","x_size","sys_size","batch_size"])
 #else
         ! Allocate input and output DFT data sizes
         fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
@@ -140,7 +140,7 @@ contains
         if (bc_y%beg >= 0) return
 #if defined(MFC_OpenACC)
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 1, cmplx_size
@@ -149,7 +149,7 @@ contains
             end do
         end do
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 0, p
@@ -172,9 +172,9 @@ contains
 #endif
         !$acc end host_data
         Nfq = 3
-        $:UPDATE(device=["Nfq"])
+        $:GPU_UPDATE(device=["Nfq"])
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 1, Nfq
@@ -192,7 +192,7 @@ contains
 #endif
         !$acc end host_data
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
                 do l = 0, p
@@ -204,7 +204,7 @@ contains
 
         do i = 1, fourier_rings
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
                     do l = 1, cmplx_size
@@ -213,7 +213,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3, firstprivate=["i"])
+            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate=["i"])
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
@@ -232,9 +232,9 @@ contains
             !$acc end host_data
 
             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
-            $:UPDATE(device=["Nfq"])
+            $:GPU_UPDATE(device=["Nfq"])
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
                     do l = 1, Nfq
@@ -252,7 +252,7 @@ contains
 #endif
             !$acc end host_data
 
-            $:PARALLEL_LOOP(collapse=3, firstprivate=["i"])
+            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate=["i"])
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index e2e14286d7..a91b2d49b6 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -54,7 +54,7 @@ module m_global_parameters
     logical :: cyl_coord
     integer :: grid_geometry
     !> @}
-    $:DECLARE(create=["cyl_coord","grid_geometry"])
+    $:GPU_DECLARE(create=["cyl_coord","grid_geometry"])
 
     !> @name Cell-boundary (CB) locations in the x-, y- and z-directions, respectively
     !> @{
@@ -77,7 +77,7 @@ module m_global_parameters
 
     real(wp) :: dt !< Size of the time-step
 
-    $:DECLARE(create=["x_cb","y_cb","z_cb","x_cc","y_cc","z_cc","dx","dy","dz","dt","m","n","p"])
+    $:GPU_DECLARE(create=["x_cb","y_cb","z_cb","x_cc","y_cc","z_cc","dx","dy","dz","dt","m","n","p"])
 
     !> @name Starting time-step iteration, stopping time-step iteration and the number
     !! of time-step iterations between successive solution backups, respectively
@@ -91,7 +91,7 @@ module m_global_parameters
     real(wp) :: t_stop, t_save, cfl_target
     integer :: n_start
     !> @}
-    $:DECLARE(create=["cfl_target"])
+    $:GPU_DECLARE(create=["cfl_target"])
 
     logical :: cfl_adap_dt, cfl_const_dt, cfl_dt
 
@@ -159,7 +159,7 @@ module m_global_parameters
     logical :: bulk_stress   !< Bulk stresses
     logical :: cont_damage   !< Continuum damage modeling
 
-    $:DECLARE(create=["chemistry"])
+    $:GPU_DECLARE(create=["chemistry"])
 
     logical :: bodyForces
     logical :: bf_x, bf_y, bf_z !< body force toggle in three directions
@@ -170,27 +170,27 @@ module m_global_parameters
         #:endfor
     #:endfor
     real(wp), dimension(3) :: accel_bf
-    $:DECLARE(create=["accel_bf"])
+    $:GPU_DECLARE(create=["accel_bf"])
 
     integer :: cpu_start, cpu_end, cpu_rate
 
     #:if not MFC_CASE_OPTIMIZATION
-        $:DECLARE(create=["num_dims","num_vels","weno_polyn","weno_order"])
-        $:DECLARE(create=["weno_num_stencils","num_fluids","wenojs"])
-        $:DECLARE(create=["mapped_weno", "wenoz","teno","wenoz_q","mhd","relativity"])
+        $:GPU_DECLARE(create=["num_dims","num_vels","weno_polyn","weno_order"])
+        $:GPU_DECLARE(create=["weno_num_stencils","num_fluids","wenojs"])
+        $:GPU_DECLARE(create=["mapped_weno", "wenoz","teno","wenoz_q","mhd","relativity"])
     #:endif
 
-    $:DECLARE(create=["mpp_lim","model_eqns","mixture_err","alt_soundspeed"])
-    $:DECLARE(create=["avg_state","mp_weno","weno_eps","teno_CT","hypoelasticity"])
-    $:DECLARE(create=["hyperelasticity","hyper_model","elasticity","low_Mach"])
-    $:DECLARE(create=["viscous","shear_stress","bulk_stress","cont_damage"])
+    $:GPU_DECLARE(create=["mpp_lim","model_eqns","mixture_err","alt_soundspeed"])
+    $:GPU_DECLARE(create=["avg_state","mp_weno","weno_eps","teno_CT","hypoelasticity"])
+    $:GPU_DECLARE(create=["hyperelasticity","hyper_model","elasticity","low_Mach"])
+    $:GPU_DECLARE(create=["viscous","shear_stress","bulk_stress","cont_damage"])
 
     logical :: relax          !< activate phase change
     integer :: relax_model    !< Relaxation model
     real(wp) :: palpha_eps     !< trigger parameter for the p relaxation procedure, phase change model
     real(wp) :: ptgalpha_eps   !< trigger parameter for the pTg relaxation procedure, phase change model
 
-    $:DECLARE(create=["relax", "relax_model", "palpha_eps","ptgalpha_eps"])
+    $:GPU_DECLARE(create=["relax", "relax_model", "palpha_eps","ptgalpha_eps"])
 
     integer :: num_bc_patches
     logical :: bc_io
@@ -200,9 +200,9 @@ module m_global_parameters
     !> @{
     type(int_bounds_info) :: bc_x, bc_y, bc_z
     !> @}
-    $:DECLARE(create=["bc_x%vb1", "bc_x%vb2", "bc_x%vb3", "bc_x%ve1", "bc_x%ve2", "bc_x%ve3"])
-    $:DECLARE(create=["bc_y%vb1", "bc_y%vb2", "bc_y%vb3", "bc_y%ve1", "bc_y%ve2", "bc_y%ve3"])
-    $:DECLARE(create=["bc_z%vb1", "bc_z%vb2", "bc_z%vb3", "bc_z%ve1", "bc_z%ve2", "bc_z%ve3"])
+    $:GPU_DECLARE(create=["bc_x%vb1", "bc_x%vb2", "bc_x%vb3", "bc_x%ve1", "bc_x%ve2", "bc_x%ve3"])
+    $:GPU_DECLARE(create=["bc_y%vb1", "bc_y%vb2", "bc_y%vb3", "bc_y%ve1", "bc_y%ve2", "bc_y%ve3"])
+    $:GPU_DECLARE(create=["bc_z%vb1", "bc_z%vb2", "bc_z%vb3", "bc_z%ve1", "bc_z%ve2", "bc_z%ve3"])
 
     type(bounds_info) :: x_domain, y_domain, z_domain
     real(wp) :: x_a, y_a, z_a
@@ -256,20 +256,20 @@ module m_global_parameters
     integer :: c_idx                                   !< Index of color function
     integer :: damage_idx                              !< Index of damage state variable (D) for continuum damage model
     !> @}
-    $:DECLARE(create=["sys_size","E_idx","n_idx","bub_idx","alf_idx","gamma_idx"])
-    $:DECLARE(create=["pi_inf_idx","B_idx","stress_idx","xi_idx","b_size"])
-    $:DECLARE(create=["tensor_size","species_idx","c_idx"])
+    $:GPU_DECLARE(create=["sys_size","E_idx","n_idx","bub_idx","alf_idx","gamma_idx"])
+    $:GPU_DECLARE(create=["pi_inf_idx","B_idx","stress_idx","xi_idx","b_size"])
+    $:GPU_DECLARE(create=["tensor_size","species_idx","c_idx"])
 
     ! Cell Indices for the (local) interior points (O-m, O-n, 0-p).
     ! Stands for "InDices With INTerior".
     type(int_bounds_info) :: idwint(1:3)
-    $:DECLARE(create=["idwint"])
+    $:GPU_DECLARE(create=["idwint"])
 
     ! Cell Indices for the entire (local) domain. In simulation and post_process,
     ! this includes the buffer region. idwbuff and idwint are the same otherwise.
     ! Stands for "InDices With BUFFer".
     type(int_bounds_info) :: idwbuff(1:3)
-    $:DECLARE(create=["idwbuff"])
+    $:GPU_DECLARE(create=["idwbuff"])
 
     !> @name The number of fluids, along with their identifying indexes, respectively,
     !! for which viscous effects, e.g. the shear and/or the volume Reynolds (Re)
@@ -279,7 +279,7 @@ module m_global_parameters
     integer, allocatable, dimension(:, :) :: Re_idx
     !> @}
 
-    $:DECLARE(create=["Re_size","Re_idx"])
+    $:GPU_DECLARE(create=["Re_size","Re_idx"])
 
     ! The WENO average (WA) flag regulates whether the calculation of any cell-
     ! average spatial derivatives is carried out in each cell by utilizing the
@@ -290,7 +290,7 @@ module m_global_parameters
     real(wp) :: wa_flg
     !> @{
 
-    $:DECLARE(create=["wa_flg"])
+    $:GPU_DECLARE(create=["wa_flg"])
 
     !> @name The coordinate direction indexes and flags (flg), respectively, for which
     !! the configurations will be determined with respect to a working direction
@@ -302,14 +302,14 @@ module m_global_parameters
     integer, dimension(3) :: dir_idx_tau !!used for hypoelasticity=true
     !> @}
 
-    $:DECLARE(create=["dir_idx","dir_flg","dir_idx_tau"])
+    $:GPU_DECLARE(create=["dir_idx","dir_flg","dir_idx_tau"])
 
     integer :: buff_size !<
     !! The number of cells that are necessary to be able to store enough boundary
     !! conditions data to march the solution in the physical computational domain
     !! to the next time-step.
 
-    $:DECLARE(create=["buff_size"])
+    $:GPU_DECLARE(create=["buff_size"])
 
     integer :: shear_num !! Number of shear stress components
     integer, dimension(3) :: shear_indices !<
@@ -320,7 +320,7 @@ module m_global_parameters
     !! Indices of shear stress components to reflect for boundary conditions.
     !! Size: (1:3, 1:shear_BC_flip_num) for (x/y/z, [indices])
 
-    $:DECLARE(create=["shear_num","shear_indices","shear_BC_flip_num","shear_BC_flip_indices"])
+    $:GPU_DECLARE(create=["shear_num","shear_indices","shear_BC_flip_num","shear_BC_flip_indices"])
 
     ! END: Simulation Algorithm Parameters
 
@@ -340,7 +340,7 @@ module m_global_parameters
     !! The finite-difference number is given by MAX(1, fd_order/2). Essentially,
     !! it is a measure of the half-size of the finite-difference stencil for the
     !! selected order of accuracy.
-    $:DECLARE(create=["fd_order","fd_number"])
+    $:GPU_DECLARE(create=["fd_order","fd_number"])
 
     logical :: probe_wrt
     logical :: integral_wrt
@@ -353,7 +353,7 @@ module m_global_parameters
     !> @{
     real(wp) :: rhoref, pref
     !> @}
-    $:DECLARE(create=["rhoref","pref"])
+    $:GPU_DECLARE(create=["rhoref","pref"])
 
     !> @name Immersed Boundaries
     !> @{
@@ -368,7 +368,7 @@ module m_global_parameters
     !! the maximum allowable number of patches, num_patches_max, may be changed
     !! in the module m_derived_types.f90.
 
-    $:DECLARE(create=["ib","num_ibs","patch_ib"])
+    $:GPU_DECLARE(create=["ib","num_ibs","patch_ib"])
     !> @}
 
     !> @name Bubble modeling
@@ -383,31 +383,31 @@ module m_global_parameters
     real(wp) :: Ca       !< Cavitation number
     real(wp) :: Web      !< Weber number
     real(wp) :: Re_inv   !< Inverse Reynolds number
-    $:DECLARE(create=["R0ref","Ca","Web","Re_inv"])
+    $:GPU_DECLARE(create=["R0ref","Ca","Web","Re_inv"])
 
     real(wp), dimension(:), allocatable :: weight !< Simpson quadrature weights
     real(wp), dimension(:), allocatable :: R0     !< Bubble sizes
     real(wp), dimension(:), allocatable :: V0     !< Bubble velocities
-    $:DECLARE(create=["weight","R0","V0"])
+    $:GPU_DECLARE(create=["weight","R0","V0"])
 
     logical :: bubbles_euler      !< Bubbles euler on/off
     logical :: polytropic   !< Polytropic  switch
     logical :: polydisperse !< Polydisperse bubbles
-    $:DECLARE(create=["bubbles_euler","polytropic","polydisperse"])
+    $:GPU_DECLARE(create=["bubbles_euler","polytropic","polydisperse"])
 
     logical :: adv_n        !< Solve the number density equation and compute alpha from number density
     logical :: adap_dt      !< Adaptive step size control
     real(wp) :: adap_dt_tol !< Tolerance to control adaptive step size
-    $:DECLARE(create=["adv_n","adap_dt","adap_dt_tol"])
+    $:GPU_DECLARE(create=["adv_n","adap_dt","adap_dt_tol"])
 
     integer :: bubble_model !< Gilmore or Keller--Miksis bubble model
     integer :: thermal      !< Thermal behavior. 1 = adiabatic, 2 = isotherm, 3 = transfer
-    $:DECLARE(create=["bubble_model","thermal"])
+    $:GPU_DECLARE(create=["bubble_model","thermal"])
 
     real(wp), allocatable, dimension(:, :, :) :: ptil  !< Pressure modification
 
     real(wp) :: poly_sigma  !< log normal sigma for polydisperse PDF
-    $:DECLARE(create=["ptil", "poly_sigma"])
+    $:GPU_DECLARE(create=["ptil", "poly_sigma"])
 
     logical :: qbmm      !< Quadrature moment method
     integer, parameter :: nmom = 6 !< Number of carried moments per R0 location
@@ -416,39 +416,39 @@ module m_global_parameters
     integer :: R0_type
 
     real(wp) :: pi_fac   !< Factor for artificial pi_inf
-    $:DECLARE(create=["qbmm", "nmomsp","nmomtot","R0_type","pi_fac"])
+    $:GPU_DECLARE(create=["qbmm", "nmomsp","nmomtot","R0_type","pi_fac"])
 
     #:if not MFC_CASE_OPTIMIZATION
-        $:DECLARE(create=["nb"])
+        $:GPU_DECLARE(create=["nb"])
     #:endif
 
     type(scalar_field), allocatable, dimension(:) :: mom_sp
     type(scalar_field), allocatable, dimension(:, :, :) :: mom_3d
-    $:DECLARE(create=["mom_sp","mom_3d"])
+    $:GPU_DECLARE(create=["mom_sp","mom_3d"])
 
     !> @}
 
     type(chemistry_parameters) :: chem_params
-    $:DECLARE(create=["chem_params"])
+    $:GPU_DECLARE(create=["chem_params"])
 
     !> @name Physical bubble parameters (see Ando 2010, Preston 2007)
     !> @{
 
     real(wp) :: R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_vl, k_nl, cp_n, cp_v
-    $:DECLARE(create=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw",])
-    $:DECLARE(create=["pv","M_n", "M_v","k_vl","k_nl","cp_n","cp_v"])
+    $:GPU_DECLARE(create=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw",])
+    $:GPU_DECLARE(create=["pv","M_n", "M_v","k_vl","k_nl","cp_n","cp_v"])
 
     real(wp), dimension(:), allocatable :: k_n, k_v, pb0, mass_n0, mass_v0, Pe_T
     real(wp), dimension(:), allocatable :: Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN
-    $:DECLARE(create=["k_n","k_v","pb0","mass_n0","mass_v0","Pe_T"])
-    $:DECLARE(create=["Re_trans_T","Re_trans_c","Im_trans_T","Im_trans_c","omegaN"])
+    $:GPU_DECLARE(create=["k_n","k_v","pb0","mass_n0","mass_v0","Pe_T"])
+    $:GPU_DECLARE(create=["Re_trans_T","Re_trans_c","Im_trans_T","Im_trans_c","omegaN"])
 
     real(wp) :: mul0, ss, gamma_v, mu_v
     real(wp) :: gamma_m, gamma_n, mu_n
     real(wp) :: gam
     !> @}
 
-    $:DECLARE(create=["mul0","ss","gamma_v","mu_v","gamma_m","gamma_n","mu_n","gam"])
+    $:GPU_DECLARE(create=["mul0","ss","gamma_v","mu_v","gamma_m","gamma_n","mu_n","gam"])
 
     !> @name Acoustic acoustic_source parameters
     !> @{
@@ -456,14 +456,14 @@ module m_global_parameters
     type(acoustic_parameters), dimension(num_probes_max) :: acoustic !< Acoustic source parameters
     integer :: num_source !< Number of acoustic sources
     !> @}
-    $:DECLARE(create=["acoustic_source","acoustic","num_source"])
+    $:GPU_DECLARE(create=["acoustic_source","acoustic","num_source"])
 
     !> @name Surface tension parameters
     !> @{
 
     real(wp) :: sigma
     logical :: surface_tension
-    $:DECLARE(create=["sigma","surface_tension"])
+    $:GPU_DECLARE(create=["sigma","surface_tension"])
     !> @}
 
     integer :: momxb, momxe
@@ -474,13 +474,13 @@ module m_global_parameters
     integer :: strxb, strxe
     integer :: chemxb, chemxe
     integer :: xibeg, xiend
-    $:DECLARE(create=["momxb","momxe","advxb","advxe","contxb","contxe"])
-    $:DECLARE(create=["intxb","intxe", "bubxb","bubxe"])
-    $:DECLARE(create=["strxb","strxe","chemxb","chemxe"])
-    $:DECLARE(create=["xibeg","xiend"])
+    $:GPU_DECLARE(create=["momxb","momxe","advxb","advxe","contxb","contxe"])
+    $:GPU_DECLARE(create=["intxb","intxe", "bubxb","bubxe"])
+    $:GPU_DECLARE(create=["strxb","strxe","chemxb","chemxe"])
+    $:GPU_DECLARE(create=["xibeg","xiend"])
 
     real(wp), allocatable, dimension(:) :: gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps
-    $:DECLARE(create=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps"])
+    $:GPU_DECLARE(create=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps"])
 
     real(wp) :: mytime       !< Current simulation time
     real(wp) :: finaltime    !< Final simulation time
@@ -491,25 +491,25 @@ module m_global_parameters
 
     type(pres_field), allocatable, dimension(:) :: mv_ts
 
-    $:DECLARE(create=["pb_ts","mv_ts"])
+    $:GPU_DECLARE(create=["pb_ts","mv_ts"])
 
     !> @name lagrangian subgrid bubble parameters
     !> @{!
     logical :: bubbles_lagrange                         !< Lagrangian subgrid bubble model switch
     type(bubbles_lagrange_parameters) :: lag_params     !< Lagrange bubbles' parameters
-    $:DECLARE(create=["bubbles_lagrange","lag_params"])
+    $:GPU_DECLARE(create=["bubbles_lagrange","lag_params"])
     !> @}
 
     real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D)
     logical :: powell !< Powell‐correction for div B = 0
-    $:DECLARE(create=["Bx0","powell"])
+    $:GPU_DECLARE(create=["Bx0","powell"])
 
     !> @name Continuum damage model parameters
     !> @{!
     real(wp) :: tau_star        !< Stress threshold for continuum damage modeling
     real(wp) :: cont_damage_s   !< Exponent s for continuum damage modeling
     real(wp) :: alpha_bar       !< Damage rate factor for continuum damage modeling
-    $:DECLARE(create=["tau_star","cont_damage_s","alpha_bar"])
+    $:GPU_DECLARE(create=["tau_star","cont_damage_s","alpha_bar"])
     !> @}
 
 contains
@@ -802,10 +802,10 @@ contains
             else
                 weno_num_stencils = weno_polyn
             end if
-            $:UPDATE(device=["weno_polyn"])
-            $:UPDATE(device=["weno_num_stencils"])
-            $:UPDATE(device=["nb"])
-            $:UPDATE(device=["num_dims","num_vels","num_fluids"])
+            $:GPU_UPDATE(device=["weno_polyn"])
+            $:GPU_UPDATE(device=["weno_num_stencils"])
+            $:GPU_UPDATE(device=["nb"])
+            $:GPU_UPDATE(device=["num_dims","num_vels","num_fluids"])
         #:endif
 
         ! Initializing the number of fluids for which viscous effects will
@@ -1041,7 +1041,7 @@ contains
             if (Re_size(1) > 0._wp) shear_stress = .true.
             if (Re_size(2) > 0._wp) bulk_stress = .true.
 
-            $:UPDATE(device=["Re_size","viscous","shear_stress","bulk_stress"])
+            $:GPU_UPDATE(device=["Re_size","viscous","shear_stress","bulk_stress"])
 
             ! Bookkeeping the indexes of any viscous fluids and any pairs of
             ! fluids whose interface will support effects of surface tension
@@ -1097,7 +1097,7 @@ contains
                     ! y-dir: flip tau_xy and tau_yz
                     ! z-dir: flip tau_xz and tau_yz
                 end if
-                $:UPDATE(device=["shear_num","shear_indices","shear_BC_flip_num","shear_BC_flip_indices"])
+                $:GPU_UPDATE(device=["shear_num","shear_indices","shear_BC_flip_num","shear_BC_flip_indices"])
             end if
 
             if (hyperelasticity) then
@@ -1164,7 +1164,7 @@ contains
         ! cell-boundary values or otherwise, the unaltered left and right,
         ! WENO-reconstructed, cell-boundary values
         wa_flg = 0._wp; if (weno_avg) wa_flg = 1._wp
-        $:UPDATE(device=["wa_flg"])
+        $:GPU_UPDATE(device=["wa_flg"])
 
         ! Resort to default WENO-JS if no other WENO scheme is selected
         #:if not MFC_CASE_OPTIMIZATION
@@ -1174,7 +1174,7 @@ contains
         if (ib) allocate (MPI_IO_IB_DATA%var%sf(0:m, 0:n, 0:p))
         Np = 0
 
-        $:UPDATE(device=["Re_size"])
+        $:GPU_UPDATE(device=["Re_size"])
 
         if (elasticity) then
             fd_number = max(1, fd_order/2)
@@ -1192,7 +1192,7 @@ contains
                                            idwint, idwbuff, viscous, &
                                            bubbles_lagrange, m, n, p, &
                                            num_dims)
-        $:UPDATE(device=["idwint", "idwbuff"])
+        $:GPU_UPDATE(device=["idwint", "idwbuff"])
 
         ! Configuring Coordinate Direction Indexes
         if (bubbles_euler) then
@@ -1202,7 +1202,7 @@ contains
                 & idwbuff(3)%beg:idwbuff(3)%end))
         end if
 
-        $:UPDATE(device=["fd_order", "fd_number"])
+        $:GPU_UPDATE(device=["fd_order", "fd_number"])
 
         if (cyl_coord .neqv. .true.) then ! Cartesian grid
             grid_geometry = 1
@@ -1229,17 +1229,17 @@ contains
         chemxb = species_idx%beg
         chemxe = species_idx%end
 
-        $:UPDATE(device=["momxb","momxe","advxb","advxe","contxb","contxe", &
+        $:GPU_UPDATE(device=["momxb","momxe","advxb","advxe","contxb","contxe", &
             & "bubxb","bubxe","intxb","intxe","sys_size","buff_size","E_idx", &
             & "alf_idx","n_idx","adv_n","adap_dt","pi_fac","strxb","strxe", &
             & "chemxb","chemxe","c_idx"])
-        $:UPDATE(device=["b_size","xibeg","xiend","tensor_size"])
+        $:GPU_UPDATE(device=["b_size","xibeg","xiend","tensor_size"])
 
-        $:UPDATE(device=["species_idx"])
-        $:UPDATE(device=["cfl_target","m","n","p"])
+        $:GPU_UPDATE(device=["species_idx"])
+        $:GPU_UPDATE(device=["cfl_target","m","n","p"])
 
-        $:UPDATE(device=["alt_soundspeed","acoustic_source","num_source"])
-        $:UPDATE(device=["dt","sys_size","buff_size","pref","rhoref", &
+        $:GPU_UPDATE(device=["alt_soundspeed","acoustic_source","num_source"])
+        $:GPU_UPDATE(device=["dt","sys_size","buff_size","pref","rhoref", &
             & "gamma_idx","pi_inf_idx","E_idx","alf_idx","stress_idx", &
             & "mpp_lim","bubbles_euler","hypoelasticity","alt_soundspeed", &
             & "avg_state","num_fluids","model_eqns","num_dims","num_vels", &
@@ -1247,14 +1247,14 @@ contains
             & "teno_CT","hyperelasticity","hyper_model","elasticity","xi_idx", &
             & "B_idx","low_Mach"])
 
-        $:UPDATE(device=["Bx0", "powell"])
+        $:GPU_UPDATE(device=["Bx0", "powell"])
 
-        $:UPDATE(device=["cont_damage","tau_star","cont_damage_s","alpha_bar"])
+        $:GPU_UPDATE(device=["cont_damage","tau_star","cont_damage_s","alpha_bar"])
 
         #:if not MFC_CASE_OPTIMIZATION
-            $:UPDATE(device=["wenojs","mapped_weno","wenoz","teno"])
-            $:UPDATE(device=["wenoz_q"])
-            $:UPDATE(device=["mhd", "relativity"])
+            $:GPU_UPDATE(device=["wenojs","mapped_weno","wenoz","teno"])
+            $:GPU_UPDATE(device=["wenoz_q"])
+            $:GPU_UPDATE(device=["mhd", "relativity"])
         #:endif
 
         !$acc enter data copyin(nb, R0ref, Ca, Web, Re_inv, weight, R0, V0, bubbles_euler, polytropic, polydisperse, qbmm, R0_type, ptil, bubble_model, thermal, poly_sigma)
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 994412a567..dd232a7cd8 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -27,14 +27,14 @@ module m_hyperelastic
     !! The btensor at the cell-interior Gaussian quadrature points.
     !! These tensor is needed to be calculated once and make the code DRY.
     type(vector_field) :: btensor !<
-    $:DECLARE(create=["btensor"])
+    $:GPU_DECLARE(create=["btensor"])
 
     real(wp), allocatable, dimension(:, :) :: fd_coeff_x
     real(wp), allocatable, dimension(:, :) :: fd_coeff_y
     real(wp), allocatable, dimension(:, :) :: fd_coeff_z
-    $:DECLARE(create=["fd_coeff_x","fd_coeff_y", "fd_coeff_z"])
+    $:GPU_DECLARE(create=["fd_coeff_x","fd_coeff_y", "fd_coeff_z"])
     real(wp), allocatable, dimension(:) :: Gs
-    $:DECLARE(create=["Gs"])
+    $:GPU_DECLARE(create=["Gs"])
 
 contains
 
@@ -56,11 +56,11 @@ contains
         @:ACC_SETUP_VFs(btensor)
 
         @:ALLOCATE(Gs(1:num_fluids))
-        $:LOOP()
+        $:GPU_LOOP()
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
-        $:UPDATE(device=["Gs"])
+        $:GPU_UPDATE(device=["Gs"])
 
         @:ALLOCATE(fd_coeff_x(-fd_number:fd_number, 0:m))
         if (n > 0) then
@@ -73,16 +73,16 @@ contains
         ! Computing centered finite difference coefficients
         call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x, buff_size, &
                                                       fd_number, fd_order)
-        $:UPDATE(device=["fd_coeff_x"])
+        $:GPU_UPDATE(device=["fd_coeff_x"])
         if (n > 0) then
             call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y, buff_size, &
                                                           fd_number, fd_order)
-            $:UPDATE(device=["fd_coeff_y"])
+            $:GPU_UPDATE(device=["fd_coeff_y"])
         end if
         if (p > 0) then
             call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z, buff_size, &
                                                           fd_number, fd_order)
-            $:UPDATE(device=["fd_coeff_z"])
+            $:GPU_UPDATE(device=["fd_coeff_z"])
         end if
 
     end subroutine s_initialize_hyperelastic_module
@@ -107,12 +107,12 @@ contains
         real(wp) :: G
         integer :: j, k, l, i, r
 
-        $:PARALLEL_LOOP(collapse=3, private=["alpha_K", "alpha_rho_K", "rho", &
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_K", "alpha_rho_K", "rho", &
             "gamma", "pi_inf", "qv", "G", "Re", "tensora", "tensorb"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = 1, num_fluids
                         alpha_rho_k(i) = q_cons_vf(i)%sf(j, k, l)
                         alpha_k(i) = q_cons_vf(advxb + i - 1)%sf(j, k, l)
@@ -125,7 +125,7 @@ contains
                     !if ( G <= verysmall ) G_K = 0_wp
 
                     if (G > verysmall) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, tensor_size
                             tensora(i) = 0_wp
                         end do
@@ -134,7 +134,7 @@ contains
                         ! number for the tensor 1-3:  dxix_dx, dxiy_dx, dxiz_dx
                         ! 4-6 :                       dxix_dy, dxiy_dy, dxiz_dy
                         ! 7-9 :                       dxix_dz, dxiy_dz, dxiz_dz
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do r = -fd_number, fd_number
                             ! derivatives in the x-direction
                             tensora(1) = tensora(1) + q_prim_vf(xibeg)%sf(j + r, k, l)*fd_coeff_x(r, j)
@@ -168,7 +168,7 @@ contains
                         if (tensorb(tensor_size) > verysmall) then
                             ! STEP 2c: computing the inverse of grad_xi tensor = F
                             ! tensorb is the adjoint, tensora becomes F
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, tensor_size - 1
                                 tensora(i) = tensorb(i)/tensorb(tensor_size)
                             end do
@@ -199,7 +199,7 @@ contains
                             q_prim_vf(E_idx)%sf(j, k, l) = q_prim_vf(E_idx)%sf(j, k, l) - &
                                                            G*q_prim_vf(xiend + 1)%sf(j, k, l)/gamma
                             ! STEP 5c: updating the Cauchy stress conservative scalar field
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, b_size - 1
                                 q_cons_vf(strxb + i - 1)%sf(j, k, l) = &
                                     rho*q_prim_vf(strxb + i - 1)%sf(j, k, l)
@@ -221,7 +221,7 @@ contains
         !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor
         !! btensor is symmetric, save the data space
     pure subroutine s_neoHookean_cauchy_solver(btensor, q_prim_vf, G, j, k, l)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(scalar_field), dimension(b_size), intent(inout) :: btensor
         real(wp), intent(in) :: G
@@ -240,7 +240,7 @@ contains
         #:endfor
         ! dividing by the jacobian for neo-Hookean model
         ! setting the tensor to the stresses for riemann solver
-        $:LOOP()
+        $:GPU_LOOP()
         do i = 1, b_size - 1
             q_prim_vf(strxb + i - 1)%sf(j, k, l) = &
                 G*btensor(i)%sf(j, k, l)/btensor(b_size)%sf(j, k, l)
@@ -260,7 +260,7 @@ contains
         !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor
         !! btensor is symmetric, save the data space
     pure subroutine s_Mooney_Rivlin_cauchy_solver(btensor, q_prim_vf, G, j, k, l)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(scalar_field), dimension(b_size), intent(inout) :: btensor
         real(wp), intent(in) :: G
@@ -281,7 +281,7 @@ contains
 
         ! dividing by the jacobian for neo-Hookean model
         ! setting the tensor to the stresses for riemann solver
-        $:LOOP()
+        $:GPU_LOOP()
         do i = 1, b_size - 1
             q_prim_vf(strxb + i - 1)%sf(j, k, l) = &
                 G*btensor(i)%sf(j, k, l)/btensor(b_size)%sf(j, k, l)
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index 16c3008687..73f6383f69 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -21,20 +21,20 @@ module m_hypoelastic
  s_compute_damage_state
 
     real(wp), allocatable, dimension(:) :: Gs
-    $:DECLARE(create=["Gs"])
+    $:GPU_DECLARE(create=["Gs"])
 
     real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz
     real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz
     real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz
-    $:DECLARE(create=["du_dx","du_dy","du_dz","dv_dx","dv_dy","dv_dz","dw_dx","dw_dy","dw_dz"])
+    $:GPU_DECLARE(create=["du_dx","du_dy","du_dz","dv_dx","dv_dy","dv_dz","dw_dx","dw_dy","dw_dz"])
 
     real(wp), allocatable, dimension(:, :, :) :: rho_K_field, G_K_field
-    $:DECLARE(create=["rho_K_field","G_K_field"])
+    $:GPU_DECLARE(create=["rho_K_field","G_K_field"])
 
     real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_z_h
-    $:DECLARE(create=["fd_coeff_x_h","fd_coeff_y_h","fd_coeff_z_h"])
+    $:GPU_DECLARE(create=["fd_coeff_x_h","fd_coeff_y_h","fd_coeff_z_h"])
 
 contains
 
@@ -56,7 +56,7 @@ contains
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
-        $:UPDATE(device=["Gs"])
+        $:GPU_UPDATE(device=["Gs"])
 
         @:ALLOCATE(fd_coeff_x_h(-fd_number:fd_number, 0:m))
         if (n > 0) then
@@ -69,16 +69,16 @@ contains
         ! Computing centered finite difference coefficients
         call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_h, buff_size, &
                                                       fd_number, fd_order)
-        $:UPDATE(device=["fd_coeff_x_h"])
+        $:GPU_UPDATE(device=["fd_coeff_x_h"])
         if (n > 0) then
             call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_h, buff_size, &
                                                           fd_number, fd_order)
-            $:UPDATE(device=["fd_coeff_y_h"])
+            $:GPU_UPDATE(device=["fd_coeff_y_h"])
         end if
         if (p > 0) then
             call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_h, buff_size, &
                                                           fd_number, fd_order)
-            $:UPDATE(device=["fd_coeff_z_h"])
+            $:GPU_UPDATE(device=["fd_coeff_z_h"])
         end if
 
     end subroutine s_initialize_hypoelastic_module
@@ -105,7 +105,7 @@ contains
             ! calculate velocity gradients + rho_K and G_K
             ! TODO: re-organize these loops one by one for GPU efficiency if possible?
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -115,11 +115,11 @@ contains
             end do
             !$acc end parallel loop
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do r = -fd_number, fd_number
                             du_dx(k, l, q) = du_dx(k, l, q) &
                                              + q_prim_vf(momxb)%sf(k + r, l, q)*fd_coeff_x_h(r, k)
@@ -131,7 +131,7 @@ contains
             !$acc end parallel loop
 
             if (ndirs > 1) then
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do q = 0, p
                     do l = 0, n
                         do k = 0, m
@@ -141,11 +141,11 @@ contains
                 end do
                 !$acc end parallel loop
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do q = 0, p
                     do l = 0, n
                         do k = 0, m
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do r = -fd_number, fd_number
                                 du_dy(k, l, q) = du_dy(k, l, q) &
                                                  + q_prim_vf(momxb)%sf(k, l + r, q)*fd_coeff_y_h(r, l)
@@ -162,7 +162,7 @@ contains
                 ! 3D
                 if (ndirs == 3) then
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do q = 0, p
                         do l = 0, n
                             do k = 0, m
@@ -173,11 +173,11 @@ contains
                     end do
                     !$acc end parallel loop
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do q = 0, p
                         do l = 0, n
                             do k = 0, m
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do r = -fd_number, fd_number
                                     du_dz(k, l, q) = du_dz(k, l, q) &
                                                      + q_prim_vf(momxb)%sf(k, l, q + r)*fd_coeff_z_h(r, q)
@@ -197,7 +197,7 @@ contains
                 end if
             end if
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -221,7 +221,7 @@ contains
             end do
 
             ! apply rhs source term to elastic stress equation
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -235,7 +235,7 @@ contains
             end do
 
         elseif (idir == 2) then
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -270,7 +270,7 @@ contains
             end do
 
         elseif (idir == 3) then
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -338,7 +338,7 @@ contains
 
         if (cyl_coord .and. idir == 2) then
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
@@ -400,13 +400,13 @@ contains
 
         if (n == 0) then
             l = 0; q = 0
-            $:PARALLEL_LOOP()
+            $:GPU_PARALLEL_LOOP()
             do k = 0, m
                 rhs_vf(damage_idx)%sf(k, l, q) = (alpha_bar*max(abs(q_cons_vf(stress_idx%beg)%sf(k, l, q)) - tau_star, 0._wp))**cont_damage_s
             end do
         elseif (p == 0) then
             q = 0
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = 0, n
                 do k = 0, m
                     ! Maximum principal stress
@@ -420,7 +420,7 @@ contains
                 end do
             end do
         else
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 1db453b8a0..bc9a9e401e 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -36,15 +36,15 @@ module m_ibm
     type(integer_field), public :: ib_markers
     type(levelset_field), public :: levelset
     type(levelset_norm_field), public :: levelset_norm
-    $:DECLARE(create=["ib_markers","levelset","levelset_norm"])
+    $:GPU_DECLARE(create=["ib_markers","levelset","levelset_norm"])
 
     type(ghost_point), dimension(:), allocatable :: ghost_points
     type(ghost_point), dimension(:), allocatable :: inner_points
-    $:DECLARE(create=["ghost_points","inner_points"])
+    $:GPU_DECLARE(create=["ghost_points","inner_points"])
 
     integer :: num_gps !< Number of ghost points
     integer :: num_inner_gps !< Number of ghost points
-    $:DECLARE(create=["gp_layers","num_gps","num_inner_gps"])
+    $:GPU_DECLARE(create=["gp_layers","num_gps","num_inner_gps"])
 
 contains
 
@@ -81,31 +81,31 @@ contains
 
         integer :: i, j, k
 
-        $:UPDATE(device=["ib_markers%sf"])
-        $:UPDATE(device=["levelset%sf"])
-        $:UPDATE(device=["levelset_norm%sf"])
+        $:GPU_UPDATE(device=["ib_markers%sf"])
+        $:GPU_UPDATE(device=["levelset%sf"])
+        $:GPU_UPDATE(device=["levelset_norm%sf"])
 
         ! Get neighboring IB variables from other processors
         call s_mpi_sendrecv_ib_buffers(ib_markers, gp_layers)
 
-        $:UPDATE(host=["ib_markers%sf"])
+        $:GPU_UPDATE(host=["ib_markers%sf"])
 
         call s_find_num_ghost_points(num_gps, num_inner_gps)
 
-        $:UPDATE(device=["num_gps", "num_inner_gps"])
+        $:GPU_UPDATE(device=["num_gps", "num_inner_gps"])
         @:ALLOCATE(ghost_points(1:num_gps))
         @:ALLOCATE(inner_points(1:num_inner_gps))
 
         !$acc enter data copyin(ghost_points, inner_points)
 
         call s_find_ghost_points(ghost_points, inner_points)
-        $:UPDATE(device=["ghost_points", "inner_points"])
+        $:GPU_UPDATE(device=["ghost_points", "inner_points"])
 
         call s_compute_image_points(ghost_points, levelset, levelset_norm)
-        $:UPDATE(device=["ghost_points"])
+        $:GPU_UPDATE(device=["ghost_points"])
 
         call s_compute_interpolation_coeffs(ghost_points)
-        $:UPDATE(device=["ghost_points"])
+        $:GPU_UPDATE(device=["ghost_points"])
 
     end subroutine s_ibm_setup
 
@@ -153,7 +153,7 @@ contains
         type(ghost_point) :: gp
         type(ghost_point) :: innerp
 
-        $:PARALLEL_LOOP(private=["physical_loc","dyn_pres","alpha_rho_IP", &
+        $:GPU_PARALLEL_LOOP(private=["physical_loc","dyn_pres","alpha_rho_IP", &
             & "alpha_IP","pres_IP","vel_IP","vel_g","vel_norm_IP","r_IP", &
             & "v_IP","pb_IP","mv_IP","nmom_IP","presb_IP","massv_IP","rho", &
             & "gamma","pi_inf","Re_K","G_K","Gs","gp","innerp","norm","buf", &
@@ -194,7 +194,7 @@ contains
             dyn_pres = 0._wp
 
             ! Set q_prim_vf params at GP so that mixture vars calculated properly
-            $:LOOP()
+            $:GPU_LOOP()
             do q = 1, num_fluids
                 q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q)
                 q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q)
@@ -230,7 +230,7 @@ contains
             end if
 
             ! Set momentum
-            $:LOOP()
+            $:GPU_LOOP()
             do q = momxb, momxe
                 q_cons_vf(q)%sf(j, k, l) = rho*vel_g(q - momxb + 1)
                 dyn_pres = dyn_pres + q_cons_vf(q)%sf(j, k, l)* &
@@ -238,7 +238,7 @@ contains
             end do
 
             ! Set continuity and adv vars
-            $:LOOP()
+            $:GPU_LOOP()
             do q = 1, num_fluids
                 q_cons_vf(q)%sf(j, k, l) = alpha_rho_IP(q)
                 q_cons_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q)
@@ -292,7 +292,7 @@ contains
             end if
 
             if (model_eqns == 3) then
-                $:LOOP()
+                $:GPU_LOOP()
                 do q = intxb, intxe
                     q_cons_vf(q)%sf(j, k, l) = alpha_IP(q - intxb + 1)*(gammas(q - intxb + 1)*pres_IP &
                                                                         + pi_infs(q - intxb + 1))
@@ -301,7 +301,7 @@ contains
         end do
 
         !Correct the state of the inner points in IBs
-        $:PARALLEL_LOOP(private=["physical_loc","dyn_pres","alpha_rho_IP", &
+        $:GPU_PARALLEL_LOOP(private=["physical_loc","dyn_pres","alpha_rho_IP", &
             & "alpha_IP","vel_g","rho","gamma","pi_inf","Re_K","innerp", &
             & "j","k","l","q"])
         do i = 1, num_inner_gps
@@ -320,7 +320,7 @@ contains
                 physical_loc = [x_cc(j), y_cc(k), 0._wp]
             end if
 
-            $:LOOP()
+            $:GPU_LOOP()
             do q = 1, num_fluids
                 q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q)
                 q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q)
@@ -335,7 +335,7 @@ contains
 
             dyn_pres = 0._wp
 
-            $:LOOP()
+            $:GPU_LOOP()
             do q = momxb, momxe
                 q_cons_vf(q)%sf(j, k, l) = rho*vel_g(q - momxb + 1)
                 dyn_pres = dyn_pres + q_cons_vf(q)%sf(j, k, l)* &
@@ -751,7 +751,7 @@ contains
     !> Function that uses the interpolation coefficients and the current state
     !! at the cell centers in order to estimate the state at the image point
     pure subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb, mv, presb_IP, massv_IP)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         type(scalar_field), &
             dimension(sys_size), &
             intent(IN) :: q_prim_vf !< Primitive Variables
@@ -804,11 +804,11 @@ contains
             end if
         end if
 
-        $:LOOP()
+        $:GPU_LOOP()
         do i = i1, i2
-            $:LOOP()
+            $:GPU_LOOP()
             do j = j1, j2
-                $:LOOP()
+                $:GPU_LOOP()
                 do k = k1, k2
 
                     coeff = gp%interp_coeffs(i - i1 + 1, j - j1 + 1, k - k1 + 1)
@@ -816,13 +816,13 @@ contains
                     pres_IP = pres_IP + coeff* &
                               q_prim_vf(E_idx)%sf(i, j, k)
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do q = momxb, momxe
                         vel_IP(q + 1 - momxb) = vel_IP(q + 1 - momxb) + coeff* &
                                                 q_prim_vf(q)%sf(i, j, k)
                     end do
 
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do l = contxb, contxe
                         alpha_rho_IP(l) = alpha_rho_IP(l) + coeff* &
                                           q_prim_vf(l)%sf(i, j, k)
@@ -835,7 +835,7 @@ contains
                     end if
 
                     if (bubbles_euler .and. .not. qbmm) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do l = 1, nb
                             if (polytropic) then
                                 r_IP(l) = r_IP(l) + coeff*q_prim_vf(bubxb + (l - 1)*2)%sf(i, j, k)
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index 6a6821e602..88f8621dbd 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -25,12 +25,12 @@ module m_mhd
     real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz
     real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz
     real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz
-    $:DECLARE(create=["du_dx","du_dy","du_dz","dv_dx","dv_dy","dv_dz","dw_dx","dw_dy","dw_dz"])
+    $:GPU_DECLARE(create=["du_dx","du_dy","du_dz","dv_dx","dv_dy","dv_dz","dw_dx","dw_dy","dw_dz"])
 
     real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_z_h
-    $:DECLARE(create=["fd_coeff_x_h","fd_coeff_y_h","fd_coeff_z_h"])
+    $:GPU_DECLARE(create=["fd_coeff_x_h","fd_coeff_y_h","fd_coeff_z_h"])
 
 contains
 
@@ -53,12 +53,12 @@ contains
 
         ! Computing centered finite difference coefficients
         call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_h, buff_size, fd_number, fd_order)
-        $:UPDATE(device=["fd_coeff_x_h"])
+        $:GPU_UPDATE(device=["fd_coeff_x_h"])
         call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_h, buff_size, fd_number, fd_order)
-        $:UPDATE(device=["fd_coeff_y_h"])
+        $:GPU_UPDATE(device=["fd_coeff_y_h"])
         if (p > 0) then
             call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_h, buff_size, fd_number, fd_order)
-            $:UPDATE(device=["fd_coeff_z_h"])
+            $:GPU_UPDATE(device=["fd_coeff_z_h"])
         end if
 
     end subroutine s_initialize_mhd_powell_module
@@ -77,22 +77,22 @@ contains
         real(wp), dimension(3) :: v, B
         real(wp) :: divB, vdotB
 
-        $:PARALLEL_LOOP(collapse=3, private=["v", "B"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["v", "B"])
         do q = 0, p
             do l = 0, n
                 do k = 0, m
 
                     divB = 0._wp
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do r = -fd_number, fd_number
                         divB = divB + q_prim_vf(B_idx%beg)%sf(k + r, l, q)*fd_coeff_x_h(r, k)
                     end do
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do r = -fd_number, fd_number
                         divB = divB + q_prim_vf(B_idx%beg + 1)%sf(k, l + r, q)*fd_coeff_y_h(r, l)
                     end do
                     if (p > 0) then
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do r = -fd_number, fd_number
                             divB = divB + q_prim_vf(B_idx%beg + 2)%sf(k, l, q + r)*fd_coeff_z_h(r, q)
                         end do
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index f271cb696e..c8e943e8d9 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -47,7 +47,7 @@ module m_mpi_proxy
     !> @{
     integer, private :: err_code, ierr, v_size
     !> @}
-    $:DECLARE(create=["v_size"])
+    $:GPU_DECLARE(create=["v_size"])
 
 contains
 
@@ -799,7 +799,7 @@ contains
             if (bc_x%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_x%end
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -835,7 +835,7 @@ contains
                 else
 #endif
 
-                    $:UPDATE(host=["ib_buff_send","ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send","ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -854,7 +854,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_x%beg
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, n
                         do j = 0, gp_layers - 1
@@ -889,7 +889,7 @@ contains
                     !$acc wait
                 else
 #endif
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -909,12 +909,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
             ! Unpacking buffer received from bc_x%beg
-            $:PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
             do l = 0, p
                 do k = 0, n
                     do j = -gp_layers, -1
@@ -930,7 +930,7 @@ contains
 
             if (bc_x%beg >= 0) then      ! PBC at the end and beginning
 
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 ! Packing buffer to be sent to bc_x%beg
                 do l = 0, p
                     do k = 0, n
@@ -967,7 +967,7 @@ contains
                 else
 #endif
 
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
                     call MPI_SENDRECV( &
                         ib_buff_send(0), &
                         gp_layers*(n + 1)*(p + 1), &
@@ -984,7 +984,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_x%end
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -1020,7 +1020,7 @@ contains
                 else
 #endif
 
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     call MPI_SENDRECV( &
                         ib_buff_send(0), &
@@ -1038,11 +1038,11 @@ contains
             end if
 
             if (rdma_mpi .eqv. .false.) then
-                $:UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device=["ib_buff_recv"])
             end if
 
             ! Unpacking buffer received from bc_x%end
-            $:PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
             do l = 0, p
                 do k = 0, n
                     do j = m + 1, m + gp_layers
@@ -1062,7 +1062,7 @@ contains
             if (bc_y%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_y%end
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1099,7 +1099,7 @@ contains
                 else
 #endif
 
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1118,7 +1118,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_y%beg
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1155,7 +1155,7 @@ contains
                 else
 #endif
 
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1175,12 +1175,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
             ! Unpacking buffer received from bc_y%beg
-            $:PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
             do l = 0, p
                 do k = -gp_layers, -1
                     do j = -gp_layers, m + gp_layers
@@ -1198,7 +1198,7 @@ contains
             if (bc_y%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_y%beg
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1235,7 +1235,7 @@ contains
                 else
 #endif
 
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1254,7 +1254,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_y%end
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1291,7 +1291,7 @@ contains
                 else
 #endif
 
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1311,12 +1311,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
             ! Unpacking buffer received form bc_y%end
-            $:PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
             do l = 0, p
                 do k = n + 1, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1336,7 +1336,7 @@ contains
             if (bc_z%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_z%end
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1374,7 +1374,7 @@ contains
                 else
 #endif
 
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1393,7 +1393,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_z%beg
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1430,7 +1430,7 @@ contains
                 else
 #endif
 
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1450,12 +1450,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
             ! Unpacking buffer from bc_z%beg
-            $:PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
             do l = -gp_layers, -1
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1474,7 +1474,7 @@ contains
             if (bc_z%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_z%beg
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1510,7 +1510,7 @@ contains
                     !$acc wait
                 else
 #endif
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1529,7 +1529,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_z%end
-                $:PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1566,7 +1566,7 @@ contains
                     !$acc wait
                 else
 #endif
-                    $:UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host=["ib_buff_send"])
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1586,12 +1586,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device=["ib_buff_recv"])
             end if
 #endif
 
             ! Unpacking buffer received from bc_z%end
-            $:PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
             do l = p + 1, p + gp_layers
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index dbc78a69d1..86ef374e1c 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -26,21 +26,21 @@ module m_qbmm
     private; public :: s_initialize_qbmm_module, s_mom_inv, s_coeff, s_compute_qbmm_rhs
 
     real(wp), allocatable, dimension(:, :, :, :, :) :: momrhs
-    $:DECLARE(create=["momrhs"])
+    $:GPU_DECLARE(create=["momrhs"])
 
     #:if MFC_CASE_OPTIMIZATION
         integer, parameter :: nterms = ${nterms}$
     #:else
         integer :: nterms
-        $:DECLARE(create=["nterms"])
+        $:GPU_DECLARE(create=["nterms"])
     #:endif
 
     type(int_bounds_info) :: is1_qbmm, is2_qbmm, is3_qbmm
-    $:DECLARE(create=["is1_qbmm","is2_qbmm","is3_qbmm"])
+    $:GPU_DECLARE(create=["is1_qbmm","is2_qbmm","is3_qbmm"])
 
     integer, allocatable, dimension(:) :: bubrs
     integer, allocatable, dimension(:, :) :: bubmoms
-    $:DECLARE(create=["bubrs","bubmoms"])
+    $:GPU_DECLARE(create=["bubrs","bubmoms"])
 
 contains
 
@@ -59,7 +59,7 @@ contains
             end if
 
             !$acc enter data copyin(nterms)
-            $:UPDATE(device=["nterms"])
+            $:GPU_UPDATE(device=["nterms"])
 
         #:endif
 
@@ -393,7 +393,7 @@ contains
             end do
         end if
 
-        $:UPDATE(device=["momrhs"])
+        $:GPU_UPDATE(device=["momrhs"])
 
         @:ALLOCATE(bubrs(1:nb))
         @:ALLOCATE(bubmoms(1:nb, 1:nmom))
@@ -401,14 +401,14 @@ contains
         do i = 1, nb
             bubrs(i) = bub_idx%rs(i)
         end do
-        $:UPDATE(device=["bubrs"])
+        $:GPU_UPDATE(device=["bubrs"])
 
         do j = 1, nmom
             do i = 1, nb
                 bubmoms(i, j) = bub_idx%moms(i, j)
             end do
         end do
-        $:UPDATE(device=["bubmoms"])
+        $:GPU_UPDATE(device=["bubmoms"])
 
     end subroutine s_initialize_qbmm_module
 
@@ -434,7 +434,7 @@ contains
         end select
 
         if (.not. polytropic) then
-            $:PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
+            $:GPU_PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
             do i = 1, nb
                 do q = 1, nnode
                     do l = 0, p
@@ -567,7 +567,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_coeff_nonpoly
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
         real(wp), intent(in) :: pres, rho, c
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
@@ -640,7 +640,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_coeff
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         real(wp), intent(in) :: pres, rho, c
@@ -716,9 +716,9 @@ contains
         integer :: id1, id2, id3, i1, i2, j, q, r
 
         is1_qbmm = ix; is2_qbmm = iy; is3_qbmm = iz
-        $:UPDATE(device=["is1_qbmm","is2_qbmm","is3_qbmm"])
+        $:GPU_UPDATE(device=["is1_qbmm","is2_qbmm","is3_qbmm"])
 
-        $:PARALLEL_LOOP(collapse=3, private=["moms", "msum", "wght", "abscX", &
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["moms", "msum", "wght", "abscX", &
             "abscY", "wght_pb", "wght_mv", "wght_ht", "coeff", "ht", "r", "q", &
             "n_tait", "B_tait", "pres", "rho", "nbub", "c", "alf", "momsum", &
             "drdt", "drdt2", "chi_vw", "x_vw", "rho_mw", "k_mw", "T_bar", "grad_T"])
@@ -741,10 +741,10 @@ contains
 
                     if (alf > small_alf) then
                         nbub = q_cons_vf(bubxb)%sf(id1, id2, id3)
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do q = 1, nb
                             ! Gather moments for this bubble bin
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do r = 2, nmom
                                 moms(r) = q_prim_vf(bubmoms(q, r))%sf(id1, id2, id3)
                             end do
@@ -752,12 +752,12 @@ contains
                             call s_chyqmom(moms, wght(:, q), abscX(:, q), abscY(:, q))
 
                             if (polytropic) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do j = 1, nnode
                                     wght_pb(j, q) = wght(j, q)*(pb0(q) - pv)
                                 end do
                             else
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do j = 1, nnode
                                     chi_vw = 1._wp/(1._wp + R_v/R_n*(pb(id1, id2, id3, j, q)/pv - 1._wp))
                                     x_vw = M_n*chi_vw/(M_v + (M_n - M_v)*chi_vw)
@@ -776,13 +776,13 @@ contains
 
                             ! Compute change in moments due to bubble dynamics
                             r = 1
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i2 = 0, 2
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i1 = 0, 2
                                     if ((i1 + i2) <= 2) then
                                         momsum = 0._wp
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do j = 1, nterms
                                             select case (bubble_model)
                                             case (3)
@@ -812,7 +812,7 @@ contains
 
                             ! Compute change in pb and mv for non-polytropic model
                             if (.not. polytropic) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do j = 1, nnode
                                     drdt = msum(2)
                                     drdt2 = merge(-1._wp, 1._wp, j == 1 .or. j == 2)/(2._wp*sqrt(merge(moms(4) - moms(2)**2._wp, verysmall, moms(4) - moms(2)**2._wp > 0._wp)))
@@ -840,11 +840,11 @@ contains
                             end if
                         end if
                     else
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do q = 1, nb
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i1 = 0, 2
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i2 = 0, 2
                                     moms3d(i1, i2, q)%sf(id1, id2, id3) = 0._wp
                                 end do
@@ -865,7 +865,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_chyqmom
 #else
-            $:ROUTINE()
+            $:GPU_ROUTINE()
 #endif
             real(wp), intent(in) :: pres, rho, c
             real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
@@ -881,7 +881,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_chyqmom
 #else
-            $:ROUTINE()
+            $:GPU_ROUTINE()
 #endif
             real(wp), dimension(nmom), intent(in) :: momin
             real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY
@@ -941,7 +941,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_hyqmom
 #else
-            $:ROUTINE()
+            $:GPU_ROUTINE()
 #endif
             real(wp), dimension(2), intent(inout) :: frho, fup
             real(wp), dimension(3), intent(in) :: fmom
@@ -960,7 +960,7 @@ contains
         end subroutine s_hyqmom
 
         pure function f_quad(abscX, abscY, wght_in, q, r, s)
-            $:ROUTINE()
+            $:GPU_ROUTINE()
             real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
             real(wp), intent(in) :: q, r, s
 
@@ -976,7 +976,7 @@ contains
         end function f_quad
 
         pure function f_quad2D(abscX, abscY, wght_in, pow)
-            $:ROUTINE()
+            $:GPU_ROUTINE()
             real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
             real(wp), dimension(3), intent(in) :: pow
 
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index e3023de18c..d87ac93811 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -76,13 +76,13 @@ module m_rhs
     !! conservative variables, which are located in q_cons_vf, at cell-interior
     !! Gaussian quadrature points (QP).
     type(vector_field) :: q_cons_qp !<
-    $:DECLARE(create=["q_cons_qp"])
+    $:GPU_DECLARE(create=["q_cons_qp"])
 
     !! The primitive variables at cell-interior Gaussian quadrature points. These
     !! are calculated from the conservative variables and gradient magnitude (GM)
     !! of the volume fractions, q_cons_qp and gm_alpha_qp, respectively.
     type(vector_field) :: q_prim_qp !<
-    $:DECLARE(create=["q_prim_qp"])
+    $:GPU_DECLARE(create=["q_prim_qp"])
 
     !> @name The first-order spatial derivatives of the primitive variables at cell-
     !! interior Gaussian quadrature points. These are WENO-reconstructed from
@@ -91,7 +91,7 @@ module m_rhs
     !! of the primitive variables, located in qK_prim_n, where K = L or R.
     !> @{
     type(vector_field), allocatable, dimension(:) :: dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp
-    $:DECLARE(create=["dq_prim_dx_qp","dq_prim_dy_qp","dq_prim_dz_qp"])
+    $:GPU_DECLARE(create=["dq_prim_dx_qp","dq_prim_dy_qp","dq_prim_dz_qp"])
     !> @}
 
     !> @name The left and right WENO-reconstructed cell-boundary values of the cell-
@@ -101,26 +101,26 @@ module m_rhs
     !> @{
     type(vector_field), allocatable, dimension(:) :: dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n
     type(vector_field), allocatable, dimension(:) :: dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n
-    $:DECLARE(create=["dqL_prim_dx_n","dqL_prim_dy_n","dqL_prim_dz_n"])
-    $:DECLARE(create=["dqR_prim_dx_n","dqR_prim_dy_n","dqR_prim_dz_n"])
+    $:GPU_DECLARE(create=["dqL_prim_dx_n","dqL_prim_dy_n","dqL_prim_dz_n"])
+    $:GPU_DECLARE(create=["dqR_prim_dx_n","dqR_prim_dy_n","dqR_prim_dz_n"])
     !> @}
 
     type(scalar_field), allocatable, dimension(:) :: tau_Re_vf
-    $:DECLARE(create=["tau_Re_vf"])
+    $:GPU_DECLARE(create=["tau_Re_vf"])
 
     type(vector_field) :: gm_alpha_qp  !<
     !! The gradient magnitude of the volume fractions at cell-interior Gaussian
     !! quadrature points. gm_alpha_qp is calculated from individual first-order
     !! spatial derivatives located in dq_prim_ds_qp.
 
-    $:DECLARE(create=["gm_alpha_qp"])
+    $:GPU_DECLARE(create=["gm_alpha_qp"])
 
     !> @name The left and right WENO-reconstructed cell-boundary values of the cell-
     !! average gradient magnitude of volume fractions, located in gm_alpha_qp.
     !> @{
     type(vector_field), allocatable, dimension(:) :: gm_alphaL_n
     type(vector_field), allocatable, dimension(:) :: gm_alphaR_n
-    $:DECLARE(create=["gm_alphaL_n","gm_alphaR_n"])
+    $:GPU_DECLARE(create=["gm_alphaL_n","gm_alphaR_n"])
     !> @}
 
     !> @name The cell-boundary values of the fluxes (src - source, gsrc - geometrical
@@ -130,38 +130,38 @@ module m_rhs
     type(vector_field), allocatable, dimension(:) :: flux_n
     type(vector_field), allocatable, dimension(:) :: flux_src_n
     type(vector_field), allocatable, dimension(:) :: flux_gsrc_n
-    $:DECLARE(create=["flux_n","flux_src_n","flux_gsrc_n"])
+    $:GPU_DECLARE(create=["flux_n","flux_src_n","flux_gsrc_n"])
     !> @}
 
     type(vector_field), allocatable, dimension(:) :: qL_prim, qR_prim
-    $:DECLARE(create=["qL_prim","qR_prim"])
+    $:GPU_DECLARE(create=["qL_prim","qR_prim"])
 
     type(int_bounds_info) :: iv !< Vector field indical bounds
-    $:DECLARE(create=["iv"])
+    $:GPU_DECLARE(create=["iv"])
 
     !> @name Indical bounds in the x-, y- and z-directions
     !> @{
     type(int_bounds_info) :: irx, iry, irz
-    $:DECLARE(create=["irx","iry","irz"])
+    $:GPU_DECLARE(create=["irx","iry","irz"])
 
     type(int_bounds_info) :: is1, is2, is3
-    $:DECLARE(create=["is1","is2","is3"])
+    $:GPU_DECLARE(create=["is1","is2","is3"])
 
     !> @name Saved fluxes for testing
     !> @{
     type(scalar_field) :: alf_sum
     !> @}
-    $:DECLARE(create=["alf_sum"])
+    $:GPU_DECLARE(create=["alf_sum"])
 
     real(wp), allocatable, dimension(:, :, :) :: blkmod1, blkmod2, alpha1, alpha2, Kterm
     real(wp), allocatable, dimension(:, :, :, :) :: qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf
     real(wp), allocatable, dimension(:, :, :, :) :: dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf
-    $:DECLARE(create=["blkmod1","blkmod2","alpha1","alpha2","Kterm"])
-    $:DECLARE(create=["qL_rsx_vf","qL_rsy_vf","qL_rsz_vf","qR_rsx_vf","qR_rsy_vf","qR_rsz_vf"])
-    $:DECLARE(create=["dqL_rsx_vf","dqL_rsy_vf","dqL_rsz_vf","dqR_rsx_vf","dqR_rsy_vf","dqR_rsz_vf"])
+    $:GPU_DECLARE(create=["blkmod1","blkmod2","alpha1","alpha2","Kterm"])
+    $:GPU_DECLARE(create=["qL_rsx_vf","qL_rsy_vf","qL_rsz_vf","qR_rsx_vf","qR_rsy_vf","qR_rsz_vf"])
+    $:GPU_DECLARE(create=["dqL_rsx_vf","dqL_rsy_vf","dqL_rsz_vf","dqR_rsx_vf","dqR_rsy_vf","dqR_rsz_vf"])
 
     real(wp), allocatable, dimension(:, :, :) :: nbub !< Bubble number density
-    $:DECLARE(create=["nbub"])
+    $:GPU_DECLARE(create=["nbub"])
 
 contains
 
@@ -175,7 +175,7 @@ contains
         integer :: num_eqns_after_adv
 
         !$acc enter data copyin(idwbuff, idwbuff)
-        $:UPDATE(device=["idwbuff", "idwbuff"])
+        $:GPU_UPDATE(device=["idwbuff", "idwbuff"])
 
         @:ALLOCATE(q_cons_qp%vf(1:sys_size))
         @:ALLOCATE(q_prim_qp%vf(1:sys_size))
@@ -569,7 +569,7 @@ contains
 
         call s_initialize_pressure_relaxation_module
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do id = 1, num_dims
             do i = 1, sys_size
                 do l = idwbuff(3)%beg, idwbuff(3)%end
@@ -609,7 +609,7 @@ contains
 
         call cpu_time(t_start)
         ! Association/Population of Working Variables
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
@@ -623,16 +623,16 @@ contains
         ! Converting Conservative to Primitive Variables
 
         if (mpp_lim .and. bubbles_euler) then
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         alf_sum%sf(j, k, l) = 0._wp
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = advxb, advxe - 1
                             alf_sum%sf(j, k, l) = alf_sum%sf(j, k, l) + q_cons_qp%vf(i)%sf(j, k, l)
                         end do
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = advxb, advxe - 1
                             q_cons_qp%vf(i)%sf(j, k, l) = q_cons_qp%vf(i)%sf(j, k, l)*(1._wp - q_cons_qp%vf(alf_idx)%sf(j, k, l)) &
                                                           /alf_sum%sf(j, k, l)
@@ -841,7 +841,7 @@ contains
         ! END: Dimensional Splitting Loop
 
         if (ib) then
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -905,7 +905,7 @@ contains
         ! END: Additional physics and source terms
 
         if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do k = idwbuff(2)%beg, idwbuff(2)%end
@@ -945,7 +945,7 @@ contains
         real(wp) :: advected_qty_val, pressure_val, velocity_val
 
         if (alt_soundspeed) then
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do q_loop = 0, p
                 do l_loop = 0, n
                     do k_loop = 0, m
@@ -979,7 +979,7 @@ contains
                 call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz)
             end if
 
-            $:PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
+            $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
             do j = 1, sys_size
                 do q_loop = 0, p
                     do l_loop = 0, n
@@ -994,7 +994,7 @@ contains
             end do
 
             if (model_eqns == 3) then
-                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
+                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
                     & "pressure_val","flux_face1","flux_face2"])
                 do q_loop = 0, p
                     do l_loop = 0, n
@@ -1024,7 +1024,7 @@ contains
                 call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz)
             end if
 
-            $:PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
+            $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
             do j = 1, sys_size
                 do l = 0, p
                     do k = 0, n
@@ -1039,7 +1039,7 @@ contains
             end do
 
             if (model_eqns == 3) then
-                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
+                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
                     & "pressure_val","flux_face1","flux_face2"])
                 do l = 0, p
                     do k = 0, n
@@ -1065,7 +1065,7 @@ contains
             end if
 
             if (cyl_coord) then
-                $:PARALLEL_LOOP(collapse=4,private=["flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private=["flux_face1","flux_face2"])
                 do j = 1, sys_size
                     do l = 0, p
                         do k = 0, n
@@ -1091,7 +1091,7 @@ contains
             end if
 
             if (grid_geometry == 3) then ! Cylindrical Coordinates
-                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","velocity_val", &
+                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","velocity_val", &
                     & "flux_face1","flux_face2"])
                 do j = 1, sys_size
                     do k = 0, p
@@ -1107,7 +1107,7 @@ contains
                         end do
                     end do
                 end do
-                $:PARALLEL_LOOP(collapse=4,private=["flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private=["flux_face1","flux_face2"])
                 do j = 1, sys_size
                     do k = 0, p
                         do q = 0, n
@@ -1121,7 +1121,7 @@ contains
                     end do
                 end do
             else ! Cartesian Coordinates
-                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
                 do j = 1, sys_size
                     do k = 0, p
                         do q = 0, n
@@ -1137,7 +1137,7 @@ contains
             end if
 
             if (model_eqns == 3) then
-                $:PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
+                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
                     & "pressure_val","flux_face1","flux_face2"])
                 do k = 0, p
                     do q = 0, n
@@ -1182,7 +1182,7 @@ contains
             case (1) ! x-direction
                 use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4)
                 if (use_standard_riemann) then
-                    $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
                         & "local_term_coeff","local_flux1","local_flux2"])
                     do j_adv = advxb, advxe
                         do q_idx = 0, p ! z_extent
@@ -1201,7 +1201,7 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
+                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", "local_flux2"])
                             do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m
@@ -1215,7 +1215,7 @@ contains
                                                                                     local_inv_ds*local_term_coeff*(local_flux1 - local_flux2)
                                     end do; end do; end do
 
-                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds",&
+                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds",&
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1231,7 +1231,7 @@ contains
                                     end do; end do; end do
                         end if
                     else ! NOT alt_soundspeed
-                        $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                        $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
                             & "local_term_coeff","local_flux1","local_flux2"])
                         do j_adv = advxb, advxe
                             do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m
@@ -1249,7 +1249,7 @@ contains
             case (2) ! y-direction: loops q_idx (x), k_idx (y), l_idx (z); sf(q_idx, k_idx, l_idx); dy(k_idx); Kterm(q_idx,k_idx,l_idx)
                 use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4)
                 if (use_standard_riemann) then
-                    $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
                         & "local_term_coeff","local_flux1","local_flux2"])
                     do j_adv = advxb, advxe
                         do l_idx = 0, p ! z_extent
@@ -1268,7 +1268,7 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
+                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1287,7 +1287,7 @@ contains
                                         end if
                                     end do; end do; end do
 
-                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
+                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1307,7 +1307,7 @@ contains
                                     end do; end do; end do
                         end if
                     else ! NOT alt_soundspeed
-                        $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                        $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
                             & "local_term_coeff","local_flux1","local_flux2"])
                         do j_adv = advxb, advxe
                             do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m
@@ -1330,7 +1330,7 @@ contains
                 end if
 
                 if (use_standard_riemann) then
-                    $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
                         & "local_term_coeff","local_flux1","local_flux2"])
                     do j_adv = advxb, advxe
                         do k_idx = 0, p ! z_extent
@@ -1349,7 +1349,7 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
+                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1364,7 +1364,7 @@ contains
                                                                                     local_inv_ds*local_term_coeff*(local_flux1 - local_flux2)
                                     end do; end do; end do
 
-                            $:PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
+                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
                                 "local_q_cons_val", "local_k_term_val", &
                                 "local_term_coeff", "local_flux1", &
                                 "local_flux2"])
@@ -1380,7 +1380,7 @@ contains
                                     end do; end do; end do
                         end if
                     else ! NOT alt_soundspeed
-                        $:PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
+                        $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
                             & "local_term_coeff","local_flux1","local_flux2"])
                         do j_adv = advxb, advxe
                             do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m
@@ -1413,7 +1413,7 @@ contains
         if (idir == 1) then ! x-direction
 
             if (surface_tension) then
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1427,11 +1427,11 @@ contains
                 end do
             end if
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = momxb, E_idx
                             rhs_vf(i)%sf(j, k, l) = &
                                 rhs_vf(i)%sf(j, k, l) + 1._wp/dx(j)* &
@@ -1445,7 +1445,7 @@ contains
         elseif (idir == 2) then ! y-direction
 
             if (surface_tension) then
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1477,10 +1477,10 @@ contains
                                                              idwbuff(1), idwbuff(2), idwbuff(3))
                     end if
 
-                    $:PARALLEL_LOOP(collapse=2)
+                    $:GPU_PARALLEL_LOOP(collapse=2)
                     do l = 0, p
                         do j = 0, m
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = momxb, E_idx
                                 rhs_vf(i)%sf(j, 0, l) = &
                                     rhs_vf(i)%sf(j, 0, l) + 1._wp/(y_cc(1) - y_cc(-1))* &
@@ -1492,11 +1492,11 @@ contains
 
                 end if
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 1, n
                         do j = 0, m
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = momxb, E_idx
                                 rhs_vf(i)%sf(j, k, l) = &
                                     rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* &
@@ -1508,11 +1508,11 @@ contains
                 end do
 
             else
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = momxb, E_idx
                                 rhs_vf(i)%sf(j, k, l) = &
                                     rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* &
@@ -1529,11 +1529,11 @@ contains
             if (cyl_coord) then
                 if ((bc_y%beg == BC_REFLECTIVE) .or. (bc_y%beg == BC_AXIS)) then
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do l = 0, p
                         do k = 1, n
                             do j = 0, m
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = momxb, E_idx
                                     rhs_vf(i)%sf(j, k, l) = &
                                         rhs_vf(i)%sf(j, k, l) - 5e-1_wp/y_cc(k)* &
@@ -1545,10 +1545,10 @@ contains
                     end do
 
                     if (viscous) then
-                        $:PARALLEL_LOOP(collapse=2)
+                        $:GPU_PARALLEL_LOOP(collapse=2)
                         do l = 0, p
                             do j = 0, m
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = momxb, E_idx
                                     rhs_vf(i)%sf(j, 0, l) = &
                                         rhs_vf(i)%sf(j, 0, l) - 1._wp/y_cc(0)* &
@@ -1559,11 +1559,11 @@ contains
                     end if
                 else
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do l = 0, p
                         do k = 0, n
                             do j = 0, m
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = momxb, E_idx
                                     rhs_vf(i)%sf(j, k, l) = &
                                         rhs_vf(i)%sf(j, k, l) - 5e-1_wp/y_cc(k)* &
@@ -1580,7 +1580,7 @@ contains
         elseif (idir == 3) then ! z-direction
 
             if (surface_tension) then
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1594,11 +1594,11 @@ contains
                 end do
             end if
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = momxb, E_idx
                             rhs_vf(i)%sf(j, k, l) = &
                                 rhs_vf(i)%sf(j, k, l) + 1._wp/dz(l)* &
@@ -1610,7 +1610,7 @@ contains
             end do
 
             if (grid_geometry == 3) then
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
@@ -1730,10 +1730,10 @@ contains
 
         end if
 
-        $:UPDATE(device=["is1","is2","is3","iv"])
+        $:GPU_UPDATE(device=["is1","is2","is3","iv"])
 
         if (recon_dir == 1) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1746,7 +1746,7 @@ contains
             end do
             !$acc end parallel loop
         else if (recon_dir == 2) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -1759,7 +1759,7 @@ contains
             end do
             !$acc end parallel loop
         else if (recon_dir == 3) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 6d0c7fb7d8..d81659c2b4 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -66,7 +66,7 @@ module m_riemann_solvers
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf, flux_src_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf, flux_src_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf, flux_src_rsz_vf
-    $:DECLARE(create=["flux_rsx_vf","flux_src_rsx_vf","flux_rsy_vf","flux_src_rsy_vf","flux_rsz_vf","flux_src_rsz_vf"])
+    $:GPU_DECLARE(create=["flux_rsx_vf","flux_src_rsx_vf","flux_rsy_vf","flux_src_rsy_vf","flux_rsz_vf","flux_src_rsz_vf"])
     !> @}
 
     !> The cell-boundary values of the geometrical source flux that are computed
@@ -77,7 +77,7 @@ module m_riemann_solvers
     real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsx_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsy_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsz_vf !<
-    $:DECLARE(create=["flux_gsrc_rsx_vf","flux_gsrc_rsy_vf","flux_gsrc_rsz_vf"])
+    $:GPU_DECLARE(create=["flux_gsrc_rsx_vf","flux_gsrc_rsy_vf","flux_gsrc_rsz_vf"])
     !> @}
 
     ! The cell-boundary values of the velocity. vel_src_rs_vf is determined as
@@ -86,17 +86,17 @@ module m_riemann_solvers
     real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsz_vf
-    $:DECLARE(create=["vel_src_rsx_vf","vel_src_rsy_vf","vel_src_rsz_vf"])
+    $:GPU_DECLARE(create=["vel_src_rsx_vf","vel_src_rsy_vf","vel_src_rsz_vf"])
 
     real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsz_vf
-    $:DECLARE(create=["mom_sp_rsx_vf","mom_sp_rsy_vf","mom_sp_rsz_vf"])
+    $:GPU_DECLARE(create=["mom_sp_rsx_vf","mom_sp_rsy_vf","mom_sp_rsz_vf"])
 
     real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsz_vf
-    $:DECLARE(create=["Re_avg_rsx_vf","Re_avg_rsy_vf","Re_avg_rsz_vf"])
+    $:GPU_DECLARE(create=["Re_avg_rsx_vf","Re_avg_rsy_vf","Re_avg_rsz_vf"])
 
     !> @name Indical bounds in the s1-, s2- and s3-directions
     !> @{
@@ -104,13 +104,13 @@ module m_riemann_solvers
     type(int_bounds_info) :: isx, isy, isz
     !> @}
 
-    $:DECLARE(create=["is1","is2","is3","isx","isy","isz"])
+    $:GPU_DECLARE(create=["is1","is2","is3","isx","isy","isz"])
 
     real(wp), allocatable, dimension(:) :: Gs
-    $:DECLARE(create=["Gs"])
+    $:GPU_DECLARE(create=["Gs"])
 
     real(wp), allocatable, dimension(:, :) :: Res
-    $:DECLARE(create=["Res"])
+    $:GPU_DECLARE(create=["Res"])
 
 contains
 
@@ -355,7 +355,7 @@ contains
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
 
             if (norm_dir == ${NORM_DIR}$) then
-                $:PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", "alpha_rho_R", &
+                $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", "alpha_rho_R", &
                     "vel_L", "vel_R", "alpha_L", "alpha_R", "tau_e_L", "tau_e_R", &
                     "G_L", "G_R", "Re_L", "Re_R", "rho_avg", "h_avg", "gamma_avg", &
                     "s_L", "s_R", "s_S", "Ys_L", "Ys_R", "xi_field_L", "xi_field_R", &
@@ -366,13 +366,13 @@ contains
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = is1%beg, is1%end
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, contxe
                                 alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                 alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                             end do
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_vels
                                 vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                 vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -380,13 +380,13 @@ contains
 
                             vel_L_rms = 0._wp; vel_R_rms = 0._wp
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_vels
                                 vel_L_rms = vel_L_rms + vel_L(i)**2._wp
                                 vel_R_rms = vel_R_rms + vel_R(i)**2._wp
                             end do
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_fluids
                                 alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                 alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
@@ -430,7 +430,7 @@ contains
                             pres_mag%R = 0._wp
 
                             if (mpp_lim) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_L(i) = max(0._wp, alpha_rho_L(i))
                                     alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp)
@@ -439,7 +439,7 @@ contains
 
                                 alpha_L = alpha_L/max(alpha_L_sum, sgm_eps)
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_R(i) = max(0._wp, alpha_rho_R(i))
                                     alpha_R(i) = min(max(0._wp, alpha_R(i)), 1._wp)
@@ -449,7 +449,7 @@ contains
                                 alpha_R = alpha_R/max(alpha_R_sum, sgm_eps)
                             end if
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_fluids
                                 rho_L = rho_L + alpha_rho_L(i)
                                 gamma_L = gamma_L + alpha_L(i)*gammas(i)
@@ -463,13 +463,13 @@ contains
                             end do
 
                             if (viscous) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, 2
                                     Re_L(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do q = 1, Re_size(i)
                                         Re_L(i) = alpha_L(Re_idx(i, q))/Res(i, q) &
                                                   + Re_L(i)
@@ -479,13 +479,13 @@ contains
 
                                 end do
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, 2
                                     Re_R(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do q = 1, Re_size(i)
                                         Re_R(i) = alpha_R(Re_idx(i, q))/Res(i, q) &
                                                   + Re_R(i)
@@ -496,7 +496,7 @@ contains
                             end if
 
                             if (chemistry) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = chemxb, chemxe
                                     Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -593,7 +593,7 @@ contains
                             if (hypoelasticity) then
                                 G_L = 0._wp; G_R = 0._wp
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     G_L = G_L + alpha_L(i)*Gs(i)
                                     G_R = G_R + alpha_R(i)*Gs(i)
@@ -626,7 +626,7 @@ contains
                             !    G_L = 0._wp
                             !    G_R = 0._wp
                             !
-                            !    $:LOOP()
+                            !    $:GPU_LOOP()
                             !    do i = 1, num_fluids
                             !        G_L = G_L + alpha_L(i)*Gs(i)
                             !        G_R = G_R + alpha_R(i)*Gs(i)
@@ -635,17 +635,17 @@ contains
                             !    if ((G_L > 1e-3_wp) .and. (G_R > 1e-3_wp)) then
                             !    E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
                             !    E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
-                            !    $:LOOP()
+                            !    $:GPU_LOOP()
                             !    do i = 1, b_size-1
                             !        tau_e_L(i) = G_L*qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                             !        tau_e_R(i) = G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
                             !    end do
-                            !    $:LOOP()
+                            !    $:GPU_LOOP()
                             !    do i = 1, b_size-1
                             !        tau_e_L(i) = 0_wp
                             !        tau_e_R(i) = 0_wp
                             !    end do
-                            !    $:LOOP()
+                            !    $:GPU_LOOP()
                             !    do i = 1, num_dims
                             !        xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
                             !        xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
@@ -673,7 +673,7 @@ contains
                             end if
 
                             if (viscous) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, 2
                                     Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                 end do
@@ -752,7 +752,7 @@ contains
 
                             ! Mass
                             if (.not. relativity) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         (s_M*alpha_rho_R(i)*vel_R(norm_dir) &
@@ -762,7 +762,7 @@ contains
                                         /(s_M - s_P)
                                 end do
                             elseif (relativity) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) &
@@ -843,7 +843,7 @@ contains
                                      + s_M*s_P*(cm%L(3) - cm%R(3))) &
                                     /(s_M - s_P)
                             elseif (bubbles_euler) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_vels
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         (s_M*(rho_R*vel_R(dir_idx(1)) &
@@ -858,7 +858,7 @@ contains
                                         + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
                                 end do
                             else if (hypoelasticity) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_vels
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         (s_M*(rho_R*vel_R(dir_idx(1)) &
@@ -874,7 +874,7 @@ contains
                                         /(s_M - s_P)
                                 end do
                             else
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_vels
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         (s_M*(rho_R*vel_R(dir_idx(1)) &
@@ -970,7 +970,7 @@ contains
                             end if
 
                             ! Advection
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf(j, k, l, i) = &
                                     (qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -995,7 +995,7 @@ contains
                             !end if
 
                             ! Div(U)?
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_vels
                                 vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
                                     (xi_M*(rho_L*vel_L(dir_idx(i))* &
@@ -1016,7 +1016,7 @@ contains
                             end if
 
                             if (chemistry) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = chemxb, chemxe
                                     Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -1066,7 +1066,7 @@ contains
                             #:if (NORM_DIR == 2)
                                 if (cyl_coord) then
                                     !Substituting the advective flux into the inviscid geometrical source flux
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, E_idx
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                     end do
@@ -1075,7 +1075,7 @@ contains
                                         flux_rs${XYZ}$_vf(j, k, l, contxe + 2) &
                                         - (s_M*pres_R - s_P*pres_L)/(s_M - s_P)
                                     ! Geometrical source of the void fraction(s) is zero
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = advxb, advxe
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                     end do
@@ -1088,7 +1088,7 @@ contains
                                         (s_M*tau_e_R(4) - s_P*tau_e_L(4)) &
                                         /(s_M - s_P)
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = strxb, strxe
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                     end do
@@ -1284,7 +1284,7 @@ contains
                 ! 6-EQUATION MODEL WITH HLLC
                 if (model_eqns == 3) then
                     !ME3
-                    $:PARALLEL_LOOP(collapse=3, private=["vel_L", "vel_R", &
+                    $:GPU_PARALLEL_LOOP(collapse=3, private=["vel_L", "vel_R", &
                         "vel_K_Star", "Re_L", "Re_R", "rho_avg", "h_avg", &
                         "gamma_avg", "s_L", "s_R", "s_S", "vel_avg_rms", &
                         "alpha_L", "alpha_R", "Ys_L", "Ys_R", "Xs_L", "Xs_R", &
@@ -1300,7 +1300,7 @@ contains
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -1325,32 +1325,32 @@ contains
                                 alpha_R_sum = 0._wp
 
                                 if (mpp_lim) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
                                         alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     end do
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
                                     end do
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
                                         alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
                                     end do
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
                                     end do
                                 end if
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -1367,13 +1367,13 @@ contains
                                 end do
 
                                 if (viscous) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, 2
                                         Re_L(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do q = 1, Re_size(i)
                                             Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_L(i)
@@ -1383,13 +1383,13 @@ contains
 
                                     end do
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, 2
                                         Re_R(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do q = 1, Re_size(i)
                                             Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_R(i)
@@ -1405,18 +1405,18 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
                                 if (hypoelasticity) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, strxe - strxb + 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
                                     end do
                                     G_L = 0_wp; G_R = 0_wp
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         G_L = G_L + alpha_L(i)*Gs(i)
                                         G_R = G_R + alpha_R(i)*Gs(i)
                                     end do
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, strxe - strxb + 1
                                         ! Elastic contribution to energy if G large enough
                                         if ((G_L > verysmall) .and. (G_R > verysmall)) then
@@ -1433,13 +1433,13 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
                                 if (hyperelasticity) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_dims
                                         xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
                                         xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
                                     end do
                                     G_L = 0_wp; G_R = 0_wp; 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         ! Mixture left and right shear modulus
                                         G_L = G_L + alpha_L(i)*Gs(i)
@@ -1450,7 +1450,7 @@ contains
                                         E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
                                         E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
                                     end if
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, b_size - 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
@@ -1474,7 +1474,7 @@ contains
                                                               vel_avg_rms, 0._wp, c_avg)
 
                                 if (viscous) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, 2
                                         Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                     end do
@@ -1569,7 +1569,7 @@ contains
 
                                 ! COMPUTING FLUXES
                                 ! MASS FLUX.
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*(vel_L(idx1) + s_M*(xi_L - 1._wp)) + &
@@ -1578,7 +1578,7 @@ contains
 
                                 ! MOMENTUM FLUX.
                                 ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = rho_Star*vel_K_Star* &
@@ -1594,7 +1594,7 @@ contains
                                 ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
                                 if (elasticity) then
                                     flux_ene_e = 0_wp; 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_dims
                                         idxi = dir_idx(i)
                                         ! MOMENTUM ELASTIC FLUX.
@@ -1612,7 +1612,7 @@ contains
                                 end if
 
                                 ! VOLUME FRACTION FLUX.
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = advxb, advxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*s_S + &
@@ -1620,7 +1620,7 @@ contains
                                 end do
 
                                 ! SOURCE TERM FOR VOLUME FRACTION ADVECTION FLUX.
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     vel_src_rs${XYZ}$_vf(j, k, l, idxi) = &
@@ -1630,7 +1630,7 @@ contains
 
                                 ! INTERNAL ENERGIES ADVECTION FLUX.
                                 ! K-th pressure and velocity in preparation for the internal energy flux
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     p_K_Star = xi_M*(xi_MP*((pres_L + pi_infs(i)/(1_wp + gammas(i)))* &
                                                             xi_L**(1_wp/gammas(i) + 1_wp) - pi_infs(i)/(1_wp + gammas(i)) - pres_L) + pres_L) + &
@@ -1649,7 +1649,7 @@ contains
 
                                 ! HYPOELASTIC STRESS EVOLUTION FLUX.
                                 if (hypoelasticity) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, strxe - strxb + 1
                                         flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + &
@@ -1659,7 +1659,7 @@ contains
 
                                 ! REFERENCE MAP FLUX.
                                 if (hyperelasticity) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_dims
                                         flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
@@ -1680,11 +1680,11 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         !Substituting the advective flux into the inviscid geometrical source flux
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = intxb, intxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -1692,7 +1692,7 @@ contains
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = &
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star
                                         ! Geometrical source of the void fraction(s) is zero
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0_wp
                                         end do
@@ -1700,7 +1700,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0_wp
                                         end do
@@ -1717,7 +1717,7 @@ contains
 
                 elseif (model_eqns == 4) then
                     !ME4
-                    $:PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", &
+                    $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", &
                         "alpha_rho_R", "vel_L", "vel_R", "alpha_L", "alpha_R", &
                         "rho_avg", "h_avg", "gamma_avg", "s_L", "s_R", "s_S", &
                         "vel_avg_rms", "nbub_L", "nbub_R", "ptilde_L", "ptilde_R"])
@@ -1725,26 +1725,26 @@ contains
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, contxe
                                     alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                                 end do
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
                                 end do
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     vel_L_rms = vel_L_rms + vel_L(i)**2._wp
                                     vel_R_rms = vel_R_rms + vel_R(i)**2._wp
                                 end do
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
@@ -1757,7 +1757,7 @@ contains
                                 gamma_L = 0._wp
                                 pi_inf_L = 0._wp
                                 qv_L = 0._wp
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     rho_L = rho_L + alpha_rho_L(i)
                                     gamma_L = gamma_L + alpha_L(i)*gammas(i)
@@ -1769,7 +1769,7 @@ contains
                                 gamma_R = 0._wp
                                 pi_inf_R = 0._wp
                                 qv_R = 0._wp
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     rho_R = rho_R + alpha_rho_R(i)
                                     gamma_R = gamma_R + alpha_R(i)*gammas(i)
@@ -1844,7 +1844,7 @@ contains
                                 xi_M = (5e-1_wp + sign(5e-1_wp, s_S))
                                 xi_P = (5e-1_wp - sign(5e-1_wp, s_S))
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*alpha_rho_L(i) &
@@ -1855,7 +1855,7 @@ contains
 
                                 ! Momentum flux.
                                 ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         xi_M*(rho_L*(vel_L(dir_idx(1))* &
@@ -1874,7 +1874,7 @@ contains
 
                                 if (bubbles_euler) then
                                     ! Put p_tilde in
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_dims
                                         flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                             flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) + &
@@ -1885,7 +1885,7 @@ contains
 
                                 flux_rs${XYZ}$_vf(j, k, l, E_idx) = 0._wp
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = alf_idx, alf_idx !only advect the void fraction
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -1895,7 +1895,7 @@ contains
                                 end do
 
                                 ! Source for volume fraction advection equation
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
 
                                     vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = 0._wp
@@ -1906,7 +1906,7 @@ contains
 
                                 ! Add advection flux for bubble variables
                                 if (bubbles_euler) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = bubxb, bubxe
                                         flux_rs${XYZ}$_vf(j, k, l, i) = &
                                             xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -1921,7 +1921,7 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         ! Substituting the advective flux into the inviscid geometrical source flux
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -1938,7 +1938,7 @@ contains
                                                                       (1._wp - dir_flg(dir_idx(1)))* &
                                                                       vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
                                         ! Geometrical source of the void fraction(s) is zero
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -1946,7 +1946,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -1970,7 +1970,7 @@ contains
                     !$acc end parallel loop
 
                 elseif (model_eqns == 2 .and. bubbles_euler) then
-                    $:PARALLEL_LOOP(collapse=3, private=["R0_L", "R0_R", "V0_L", &
+                    $:GPU_PARALLEL_LOOP(collapse=3, private=["R0_L", "R0_R", "V0_L", &
                         "V0_R", "P0_L", "P0_R", "pbw_L", "pbw_R", "vel_L", &
                         "vel_R", "rho_avg", "alpha_L", "alpha_R", "h_avg", &
                         "gamma_avg", "s_L", "s_R", "s_S", "nbub_L", "nbub_R", &
@@ -1980,7 +1980,7 @@ contains
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
@@ -1988,7 +1988,7 @@ contains
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -2006,7 +2006,7 @@ contains
 
                                 ! Retain this in the refactor
                                 if (mpp_lim .and. (num_fluids > 2)) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -2014,7 +2014,7 @@ contains
                                         qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
                                     end do
                                 else if (num_fluids > 2) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids - 1
                                         rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -2034,7 +2034,7 @@ contains
                                 qv_R = 0._wp
 
                                 if (mpp_lim .and. (num_fluids > 2)) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                                         gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
@@ -2042,7 +2042,7 @@ contains
                                         qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
                                     end do
                                 else if (num_fluids > 2) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids - 1
                                         rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                                         gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
@@ -2058,13 +2058,13 @@ contains
 
                                 if (viscous) then
                                     if (num_fluids == 1) then ! Need to consider case with num_fluids >= 2
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, 2
                                             Re_L(i) = dflt_real
 
                                             if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                            $:LOOP()
+                                            $:GPU_LOOP()
                                             do q = 1, Re_size(i)
                                                 Re_L(i) = (1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q)))/Res(i, q) &
                                                           + Re_L(i)
@@ -2074,13 +2074,13 @@ contains
 
                                         end do
 
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, 2
                                             Re_R(i) = dflt_real
 
                                             if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                            $:LOOP()
+                                            $:GPU_LOOP()
                                             do q = 1, Re_size(i)
                                                 Re_R(i) = (1._wp - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q)))/Res(i, q) &
                                                           + Re_R(i)
@@ -2099,7 +2099,7 @@ contains
                                 H_R = (E_R + pres_R)/rho_R
 
                                 if (avg_state == 2) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, nb
                                         R0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, rs(i))
                                         R0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, rs(i))
@@ -2119,7 +2119,7 @@ contains
                                         else
                                             nbub_L_denom = 0._wp
                                             nbub_R_denom = 0._wp
-                                            $:LOOP()
+                                            $:GPU_LOOP()
                                             do i = 1, nb
                                                 nbub_L_denom = nbub_L_denom + (R0_L(i)**3._wp)*weight(i)
                                                 nbub_R_denom = nbub_R_denom + (R0_R(i)**3._wp)*weight(i)
@@ -2133,7 +2133,7 @@ contains
                                         nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, bubxb)
                                     end if
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, nb
                                         if (.not. qbmm) then
                                             if (polytropic) then
@@ -2166,7 +2166,7 @@ contains
                                         R3V2Lbar = 0._wp
                                         R3V2Rbar = 0._wp
 
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, nb
                                             PbwR3Lbar = PbwR3Lbar + pbw_L(i)*(R0_L(i)**3._wp)*weight(i)
                                             PbwR3Rbar = PbwR3Rbar + pbw_R(i)*(R0_R(i)**3._wp)*weight(i)
@@ -2201,7 +2201,7 @@ contains
                                     gamma_avg = 5e-1_wp*(gamma_L + gamma_R)
                                     vel_avg_rms = 0._wp
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_dims
                                         vel_avg_rms = vel_avg_rms + (5e-1_wp*(vel_L(i) + vel_R(i)))**2._wp
                                     end do
@@ -2220,7 +2220,7 @@ contains
                                                               vel_avg_rms, 0._wp, c_avg)
 
                                 if (viscous) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, 2
                                         Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                     end do
@@ -2284,7 +2284,7 @@ contains
                                     pcorr = 0._wp
                                 end if
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2303,7 +2303,7 @@ contains
 
                                 ! Include p_tilde
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         xi_M*(rho_L*(vel_L(dir_idx(1))* &
@@ -2335,7 +2335,7 @@ contains
                                     + (s_M/s_L)*(s_P/s_R)*pcorr*s_S
 
                                 ! Volume fraction flux
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = advxb, advxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2345,7 +2345,7 @@ contains
                                 end do
 
                                 ! Source for volume fraction advection equation
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
                                         xi_M*(vel_L(dir_idx(i)) + &
@@ -2361,7 +2361,7 @@ contains
                                 flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
 
                                 ! Add advection flux for bubble variables
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = bubxb, bubxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2390,7 +2390,7 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         ! Substituting the advective flux into the inviscid geometrical source flux
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -2407,7 +2407,7 @@ contains
                                                                       (1._wp - dir_flg(dir_idx(1)))* &
                                                                       vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
                                         ! Geometrical source of the void fraction(s) is zero
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -2415,7 +2415,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -2441,7 +2441,7 @@ contains
                     !$acc end parallel loop
                 else
                     ! 5-EQUATION MODEL WITH HLLC
-                    $:PARALLEL_LOOP(collapse=3, private=["vel_L", "vel_R", &
+                    $:GPU_PARALLEL_LOOP(collapse=3, private=["vel_L", "vel_R", &
                         "Re_L", "Re_R", "rho_avg", "h_avg", "gamma_avg", &
                         "alpha_L", "alpha_R", "s_L", "s_R", "s_S", &
                         "vel_avg_rms", "pcorr", "zcoef", "vel_L_tmp", &
@@ -2456,14 +2456,14 @@ contains
 
                                 !idx1 = 1; if (dir_idx(1) == 2) idx1 = 2; if (dir_idx(1) == 3) idx1 = 3
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
                                 end do
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -2490,32 +2490,32 @@ contains
                                 ! Change this by splitting it into the cases
                                 ! present in the bubbles_euler
                                 if (mpp_lim) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
                                         alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     end do
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
                                     end do
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
                                         alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
                                     end do
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
                                     end do
                                 end if
 
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -2529,13 +2529,13 @@ contains
                                 end do
 
                                 if (viscous) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, 2
                                         Re_L(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do q = 1, Re_size(i)
                                             Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_L(i)
@@ -2545,13 +2545,13 @@ contains
 
                                     end do
 
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, 2
                                         Re_R(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do q = 1, Re_size(i)
                                             Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_R(i)
@@ -2563,7 +2563,7 @@ contains
 
                                 if (chemistry) then
                                     c_sum_Yi_Phi = 0.0_wp
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = chemxb, chemxe
                                         Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -2622,19 +2622,19 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
                                 if (hypoelasticity) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, strxe - strxb + 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
                                     end do
                                     G_L = 0_wp
                                     G_R = 0_wp
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         G_L = G_L + alpha_L(i)*Gs(i)
                                         G_R = G_R + alpha_R(i)*Gs(i)
                                     end do
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, strxe - strxb + 1
                                         ! Elastic contribution to energy if G large enough
                                         if ((G_L > verysmall) .and. (G_R > verysmall)) then
@@ -2651,14 +2651,14 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
                                 if (hyperelasticity) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_dims
                                         xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
                                         xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
                                     end do
                                     G_L = 0_wp
                                     G_R = 0_wp
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_fluids
                                         ! Mixture left and right shear modulus
                                         G_L = G_L + alpha_L(i)*Gs(i)
@@ -2669,7 +2669,7 @@ contains
                                         E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
                                         E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
                                     end if
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, b_size - 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
@@ -2693,7 +2693,7 @@ contains
                                                               vel_avg_rms, c_sum_Yi_Phi, c_avg)
 
                                 if (viscous) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, 2
                                         Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                     end do
@@ -2769,7 +2769,7 @@ contains
 
                                 ! COMPUTING THE HLLC FLUXES
                                 ! MASS FLUX.
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2780,7 +2780,7 @@ contains
 
                                 ! MOMENTUM FLUX.
                                 ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = &
@@ -2815,7 +2815,7 @@ contains
                                 ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
                                 if (elasticity) then
                                     flux_ene_e = 0_wp
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_dims
                                         idxi = dir_idx(i)
                                         ! MOMENTUM ELASTIC FLUX.
@@ -2834,7 +2834,7 @@ contains
 
                                 ! HYPOELASTIC STRESS EVOLUTION FLUX.
                                 if (hypoelasticity) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, strxe - strxb + 1
                                         flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + &
@@ -2843,7 +2843,7 @@ contains
                                 end if
 
                                 ! VOLUME FRACTION FLUX.
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = advxb, advxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2853,7 +2853,7 @@ contains
                                 end do
 
                                 ! VOLUME FRACTION SOURCE FLUX.
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     vel_src_rs${XYZ}$_vf(j, k, l, idxi) = &
@@ -2876,7 +2876,7 @@ contains
 
                                 ! REFERENCE MAP FLUX.
                                 if (hyperelasticity) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = 1, num_dims
                                         flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
@@ -2889,7 +2889,7 @@ contains
                                 flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, idx1)
 
                                 if (chemistry) then
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do i = chemxb, chemxe
                                         Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -2904,7 +2904,7 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         !Substituting the advective flux into the inviscid geometrical source flux
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -2921,7 +2921,7 @@ contains
                                                                       (1._wp - dir_flg(idx1))* &
                                                                       vel_R(idx1)) - vel_R(idx1))))
                                         ! Geometrical source of the void fraction(s) is zero
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -2929,7 +2929,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        $:LOOP()
+                                        $:GPU_LOOP()
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -3066,7 +3066,7 @@ contains
 
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
             if (norm_dir == ${NORM_DIR}$) then
-                $:PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", &
+                $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", &
                     "alpha_rho_R", "vel", "alpha_L", "alpha_R", "rho", "pres", &
                     "E", "H_no_mag", "gamma", "pi_inf", "qv", "vel_rms", "B", &
                     "c", "c_fast", "pres_mag", "U_L", "U_R", "U_starL", &
@@ -3121,7 +3121,7 @@ contains
                             ! Sum properties of all fluid components
                             rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp
                             rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_fluids
                                 rho%L = rho%L + alpha_rho_L(i)
                                 gamma%L = gamma%L + alpha_L(i)*gammas(i)
@@ -3291,7 +3291,7 @@ contains
                             ! Energy
                             flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7)
                             ! Partial fraction
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now)
                             end do
@@ -3323,7 +3323,7 @@ contains
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
-        $:UPDATE(device=["Gs"])
+        $:GPU_UPDATE(device=["Gs"])
 
         if (viscous) then
             @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
@@ -3335,7 +3335,7 @@ contains
                     Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
                 end do
             end do
-            $:UPDATE(device=["Res","Re_idx","Re_size"])
+            $:GPU_UPDATE(device=["Res","Re_idx","Re_size"])
         end if
 
         !$acc enter data copyin(is1, is2, is3, isx, isy, isz)
@@ -3481,7 +3481,7 @@ contains
             dir_idx = (/3, 1, 2/); dir_flg = (/0._wp, 0._wp, 1._wp/)
         end if
 
-        $:UPDATE(device=["is1","is2","is3"])
+        $:GPU_UPDATE(device=["is1","is2","is3"])
 
         if (elasticity) then
             if (norm_dir == 1) then
@@ -3495,15 +3495,15 @@ contains
 
         isx = ix; isy = iy; isz = iz
         ! for stuff in the same module
-        $:UPDATE(device=["isx","isy","isz"])
+        $:GPU_UPDATE(device=["isx","isy","isz"])
         ! for stuff in different modules
-        $:UPDATE(device=["dir_idx","dir_flg","dir_idx_tau"])
+        $:GPU_UPDATE(device=["dir_idx","dir_flg","dir_idx_tau"])
 
         ! Population of Buffers in x-direction
         if (norm_dir == 1) then
 
             if (bc_x%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3514,7 +3514,7 @@ contains
                 end do
 
                 if (viscous) then
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do k = isy%beg, isy%end
@@ -3526,7 +3526,7 @@ contains
                     end do
 
                     if (n > 0) then
-                        $:PARALLEL_LOOP(collapse=3)
+                        $:GPU_PARALLEL_LOOP(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do k = isy%beg, isy%end
@@ -3538,7 +3538,7 @@ contains
                         end do
 
                         if (p > 0) then
-                            $:PARALLEL_LOOP(collapse=3)
+                            $:GPU_PARALLEL_LOOP(collapse=3)
                             do i = momxb, momxe
                                 do l = isz%beg, isz%end
                                     do k = isy%beg, isy%end
@@ -3558,7 +3558,7 @@ contains
 
             if (bc_x%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3570,7 +3570,7 @@ contains
 
                 if (viscous) then
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do k = isy%beg, isy%end
@@ -3582,7 +3582,7 @@ contains
                     end do
 
                     if (n > 0) then
-                        $:PARALLEL_LOOP(collapse=3)
+                        $:GPU_PARALLEL_LOOP(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do k = isy%beg, isy%end
@@ -3594,7 +3594,7 @@ contains
                         end do
 
                         if (p > 0) then
-                            $:PARALLEL_LOOP(collapse=3)
+                            $:GPU_PARALLEL_LOOP(collapse=3)
                             do i = momxb, momxe
                                 do l = isz%beg, isz%end
                                     do k = isy%beg, isy%end
@@ -3617,7 +3617,7 @@ contains
         elseif (norm_dir == 2) then
 
             if (bc_y%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3629,7 +3629,7 @@ contains
 
                 if (viscous) then
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3639,7 +3639,7 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3650,7 +3650,7 @@ contains
                     end do
 
                     if (p > 0) then
-                        $:PARALLEL_LOOP(collapse=3)
+                        $:GPU_PARALLEL_LOOP(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do j = isx%beg, isx%end
@@ -3667,7 +3667,7 @@ contains
 
             if (bc_y%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3679,7 +3679,7 @@ contains
 
                 if (viscous) then
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3689,7 +3689,7 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
                             do j = isx%beg, isx%end
@@ -3700,7 +3700,7 @@ contains
                     end do
 
                     if (p > 0) then
-                        $:PARALLEL_LOOP(collapse=3)
+                        $:GPU_PARALLEL_LOOP(collapse=3)
                         do i = momxb, momxe
                             do l = isz%beg, isz%end
                                 do j = isx%beg, isx%end
@@ -3720,7 +3720,7 @@ contains
         else
 
             if (bc_z%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3731,7 +3731,7 @@ contains
                 end do
 
                 if (viscous) then
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3740,7 +3740,7 @@ contains
                             end do
                         end do
                     end do
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3749,7 +3749,7 @@ contains
                             end do
                         end do
                     end do
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3764,7 +3764,7 @@ contains
 
             if (bc_z%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3775,7 +3775,7 @@ contains
                 end do
 
                 if (viscous) then
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3785,7 +3785,7 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3795,7 +3795,7 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
                             do j = isx%beg, isx%end
@@ -3847,7 +3847,7 @@ contains
 
             if (viscous .or. (surface_tension)) then
 
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = momxb, E_idx
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3861,7 +3861,7 @@ contains
 
             if (qbmm) then
 
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, 4
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3877,7 +3877,7 @@ contains
         elseif (norm_dir == 2) then
 
             if (viscous .or. (surface_tension)) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = momxb, E_idx
                     do l = is3%beg, is3%end
                         do j = is1%beg, is1%end
@@ -3890,7 +3890,7 @@ contains
             end if
 
             if (qbmm) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, 4
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3906,7 +3906,7 @@ contains
         else
 
             if (viscous .or. (surface_tension)) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = momxb, E_idx
                     do j = is1%beg, is1%end
                         do k = is2%beg, is2%end
@@ -3919,7 +3919,7 @@ contains
             end if
 
             if (qbmm) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, 4
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
@@ -3985,7 +3985,7 @@ contains
         integer :: i_vel             !!< Loop iterator for velocity components.
         integer :: idx_rp(3)         !!< Indices $(j,k,l)$ of 'right' point for averaging.
 
-        $:PARALLEL_LOOP(collapse=3, private=["idx_rp", "avg_v_int", &
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["idx_rp", "avg_v_int", &
             "avg_dvdx_int", "avg_dvdy_int", "avg_dvdz_int", "Re_s", "Re_b", &
             "vel_src_int", "r_eff", "divergence_cyl", "stress_vector_shear", &
             "stress_normal_bulk", "div_v_term_const"])
@@ -3999,7 +3999,7 @@ contains
 
                     ! Average velocities and their derivatives at the interface
                     ! For cylindrical: x-dir ~ axial (z_cyl), y-dir ~ radial (r_cyl), z-dir ~ azimuthal (theta_cyl)
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i_vel = 1, num_dims
                         avg_v_int(i_vel) = 0.5_wp*(velL_vf(i_vel)%sf(j, k, l) + velR_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
 
@@ -4077,7 +4077,7 @@ contains
                             end if
                         end select
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i_vel = 1, num_dims
                             flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) = flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) - stress_vector_shear(i_vel)
                             flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(i_vel)*stress_vector_shear(i_vel)
@@ -4148,7 +4148,7 @@ contains
 
         real(wp) :: divergence_v   !< Velocity divergence at interface.
 
-        $:PARALLEL_LOOP(collapse=3, private=["idx_right_phys", "vel_grad_avg", &
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["idx_right_phys", "vel_grad_avg", &
             "current_tau_shear", "current_tau_bulk", "vel_src_at_interface", &
             "Re_shear", "Re_bulk", "divergence_v", "i_dim", "vel_comp_idx"])
         do l_loop = isz%beg, isz%end
@@ -4242,7 +4242,7 @@ contains
     !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
     !! @param[out] tau_shear_out Calculated shear stress tensor (stress on i-face, j-direction).
     pure subroutine s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, tau_shear_out)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 
         implicit none
 
@@ -4276,7 +4276,7 @@ contains
     !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
     !! @param[out] tau_bulk_out Calculated bulk stress tensor (stress on i-face, i-direction).
     pure subroutine s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, tau_bulk_out)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 
         implicit none
 
@@ -4316,7 +4316,7 @@ contains
 
         ! Reshaping Outputted Data in y-direction
         if (norm_dir == 2) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do l = is3%beg, is3%end
                     do j = is1%beg, is1%end
@@ -4329,7 +4329,7 @@ contains
             end do
 
             if (cyl_coord) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, sys_size
                     do l = is3%beg, is3%end
                         do j = is1%beg, is1%end
@@ -4342,7 +4342,7 @@ contains
                 end do
             end if
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3%beg, is3%end
                 do j = is1%beg, is1%end
                     do k = is2%beg, is2%end
@@ -4353,7 +4353,7 @@ contains
             end do
 
             if (riemann_solver == 1 .or. riemann_solver == 4) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = advxb + 1, advxe
                     do l = is3%beg, is3%end
                         do j = is1%beg, is1%end
@@ -4368,7 +4368,7 @@ contains
             end if
             ! Reshaping Outputted Data in z-direction
         elseif (norm_dir == 3) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do j = is1%beg, is1%end
                     do k = is2%beg, is2%end
@@ -4381,7 +4381,7 @@ contains
                 end do
             end do
             if (grid_geometry == 3) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, sys_size
                     do j = is1%beg, is1%end
                         do k = is2%beg, is2%end
@@ -4395,7 +4395,7 @@ contains
                 end do
             end if
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do j = is1%beg, is1%end
                 do k = is2%beg, is2%end
                     do l = is3%beg, is3%end
@@ -4406,7 +4406,7 @@ contains
             end do
 
             if (riemann_solver == 1 .or. riemann_solver == 4) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = advxb + 1, advxe
                     do j = is1%beg, is1%end
                         do k = is2%beg, is2%end
@@ -4420,7 +4420,7 @@ contains
 
             end if
         elseif (norm_dir == 1) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = 1, sys_size
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -4432,7 +4432,7 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3%beg, is3%end
                 do k = is2%beg, is2%end
                     do j = is1%beg, is1%end
@@ -4443,7 +4443,7 @@ contains
             end do
 
             if (riemann_solver == 1 .or. riemann_solver == 4) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = advxb + 1, advxe
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 5dc7ee30a7..2ff31ea283 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -34,7 +34,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_enthalpy
 #else
-        $:ROUTINE()
+        $:GPU_ROUTINE()
 #endif
 
         type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
@@ -49,7 +49,7 @@ contains
 
         integer :: i
 
-        $:LOOP()
+        $:GPU_LOOP()
         do i = 1, num_fluids
             alpha_rho(i) = q_prim_vf(i)%sf(j, k, l)
             alpha(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
@@ -64,13 +64,13 @@ contains
             call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha, alpha_rho, Re)
         end if
 
-        $:LOOP()
+        $:GPU_LOOP()
         do i = 1, num_vels
             vel(i) = q_prim_vf(contxe + i)%sf(j, k, l)
         end do
 
         vel_sum = 0._wp
-        $:LOOP()
+        $:GPU_LOOP()
         do i = 1, num_vels
             vel_sum = vel_sum + vel(i)**2._wp
         end do
@@ -99,7 +99,7 @@ contains
         !! @param vcfl_sf (optional) cell centered viscous cfl number
         !! @param Rc_sf (optional) cell centered Rc
     pure subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), intent(in), dimension(num_vels) :: vel
         real(wp), intent(in) :: c, rho
         real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: icfl_sf
@@ -196,7 +196,7 @@ contains
         !! @param k y coordinate
         !! @param l z coordinate
     pure subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l)
-        $:ROUTINE()
+        $:GPU_ROUTINE()
         real(wp), dimension(num_vels), intent(in) :: vel
         real(wp), intent(in) :: c, rho
         real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: max_dt
@@ -275,17 +275,17 @@ contains
 
         bc_type(1, -1)%sf(:, :, :) = bc_x%beg
         bc_type(1, 1)%sf(:, :, :) = bc_x%end
-        $:UPDATE(device=["bc_type(1,-1)%sf","bc_type(1,1)%sf"])
+        $:GPU_UPDATE(device=["bc_type(1,-1)%sf","bc_type(1,1)%sf"])
 
         if (n > 0) then
             bc_type(2, -1)%sf(:, :, :) = bc_y%beg
             bc_type(2, 1)%sf(:, :, :) = bc_y%end
-            $:UPDATE(device=["bc_type(2,-1)%sf","bc_type(2,1)%sf"])
+            $:GPU_UPDATE(device=["bc_type(2,-1)%sf","bc_type(2,1)%sf"])
 
             if (p > 0) then
                 bc_type(3, -1)%sf(:, :, :) = bc_z%beg
                 bc_type(3, 1)%sf(:, :, :) = bc_z%end
-                $:UPDATE(device=["bc_type(3,-1)%sf","bc_type(3,1)%sf"])
+                $:GPU_UPDATE(device=["bc_type(3,-1)%sf","bc_type(3,1)%sf"])
             end if
         end if
 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index bd4ea50666..981c9f8020 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1263,12 +1263,12 @@ contains
         if (cfl_dt) then
             if ((mytime + dt) >= t_stop) then
                 dt = t_stop - mytime
-                $:UPDATE(device=["dt"])
+                $:GPU_UPDATE(device=["dt"])
             end if
         else
             if ((mytime + dt) >= finaltime) then
                 dt = finaltime - mytime
-                $:UPDATE(device=["dt"])
+                $:GPU_UPDATE(device=["dt"])
             end if
         end if
 
@@ -1292,7 +1292,7 @@ contains
 
         if (probe_wrt) then
             do i = 1, sys_size
-                $:UPDATE(host=["q_cons_ts(1)%vf(i)%sf"])
+                $:GPU_UPDATE(host=["q_cons_ts(1)%vf(i)%sf"])
             end do
         end if
 
@@ -1395,7 +1395,7 @@ contains
         call cpu_time(start)
         call nvtxStartRange("SAVE-DATA")
         do i = 1, sys_size
-            $:UPDATE(host=["q_cons_ts(1)%vf(i)%sf"])
+            $:GPU_UPDATE(host=["q_cons_ts(1)%vf(i)%sf"])
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -1409,8 +1409,8 @@ contains
         end do
 
         if (qbmm .and. .not. polytropic) then
-            $:UPDATE(host=["pb_ts(1)%sf"])
-            $:UPDATE(host=["mv_ts(1)%sf"])
+            $:GPU_UPDATE(host=["pb_ts(1)%sf"])
+            $:GPU_UPDATE(host=["mv_ts(1)%sf"])
         end if
 
         if (cfl_dt) then
@@ -1420,16 +1420,16 @@ contains
         end if
 
         if (bubbles_lagrange) then
-            $:UPDATE(host=["intfc_rad"])
+            $:GPU_UPDATE(host=["intfc_rad"])
             do i = 1, nBubs
                 if (ieee_is_nan(intfc_rad(i, 1)) .or. intfc_rad(i, 1) <= 0._wp) then
                     call s_mpi_abort("Bubble radius is negative or NaN, please reduce dt.")
                 end if
             end do
 
-            $:UPDATE(host=["q_beta%vf(1)%sf"])
+            $:GPU_UPDATE(host=["q_beta%vf(1)%sf"])
             call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, q_beta%vf(1))
-            $:UPDATE(host=["Rmax_stats","Rmin_stats","gas_p","gas_mv","intfc_vel"])
+            $:GPU_UPDATE(host=["Rmax_stats","Rmin_stats","gas_p","gas_mv","intfc_vel"])
             call s_write_restart_lag_bubbles(save_count) !parallel
             if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats()
         else
@@ -1626,44 +1626,44 @@ contains
         integer :: i
         !Update GPU DATA
         do i = 1, sys_size
-            $:UPDATE(device=["q_cons_ts(1)%vf(i)%sf"])
+            $:GPU_UPDATE(device=["q_cons_ts(1)%vf(i)%sf"])
         end do
 
         if (qbmm .and. .not. polytropic) then
-            $:UPDATE(device=["pb_ts(1)%sf","mv_ts(1)%sf"])
+            $:GPU_UPDATE(device=["pb_ts(1)%sf","mv_ts(1)%sf"])
         end if
         if (chemistry) then
-            $:UPDATE(device=["q_T_sf%sf"])
+            $:GPU_UPDATE(device=["q_T_sf%sf"])
         end if
-        $:UPDATE(device=["nb","R0ref","Ca","Web","Re_inv","weight","R0","V0", &
+        $:GPU_UPDATE(device=["nb","R0ref","Ca","Web","Re_inv","weight","R0","V0", &
             & "bubbles_euler","polytropic","polydisperse","qbmm","R0_type", &
             & "ptil","bubble_model","thermal","poly_sigma","adv_n","adap_dt", &
             & "adap_dt_tol","n_idx","pi_fac","low_Mach"])
-        $:UPDATE(device=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw","pv","M_n", &
+        $:GPU_UPDATE(device=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw","pv","M_n", &
             & "M_v","k_n","k_v","pb0","mass_n0","mass_v0","Pe_T","Re_trans_T", &
             & "Re_trans_c","Im_trans_T","Im_trans_c","omegaN","mul0","ss", &
             & "gamma_v","mu_v","gamma_m","gamma_n","mu_n","gam"])
 
-        $:UPDATE(device=["acoustic_source", "num_source"])
-        $:UPDATE(device=["sigma", "surface_tension"])
+        $:GPU_UPDATE(device=["acoustic_source", "num_source"])
+        $:GPU_UPDATE(device=["sigma", "surface_tension"])
 
-        $:UPDATE(device=["dx","dy","dz","x_cb","x_cc","y_cb","y_cc","z_cb","z_cc"])
+        $:GPU_UPDATE(device=["dx","dy","dz","x_cb","x_cc","y_cb","y_cc","z_cb","z_cc"])
 
-        $:UPDATE(device=["bc_x%vb1","bc_x%vb2","bc_x%vb3","bc_x%ve1","bc_x%ve2","bc_x%ve3"])
-        $:UPDATE(device=["bc_y%vb1","bc_y%vb2","bc_y%vb3","bc_y%ve1","bc_y%ve2","bc_y%ve3"])
-        $:UPDATE(device=["bc_z%vb1","bc_z%vb2","bc_z%vb3","bc_z%ve1","bc_z%ve2","bc_z%ve3"])
+        $:GPU_UPDATE(device=["bc_x%vb1","bc_x%vb2","bc_x%vb3","bc_x%ve1","bc_x%ve2","bc_x%ve3"])
+        $:GPU_UPDATE(device=["bc_y%vb1","bc_y%vb2","bc_y%vb3","bc_y%ve1","bc_y%ve2","bc_y%ve3"])
+        $:GPU_UPDATE(device=["bc_z%vb1","bc_z%vb2","bc_z%vb3","bc_z%ve1","bc_z%ve2","bc_z%ve3"])
 
-        $:UPDATE(device=["bc_x%grcbc_in","bc_x%grcbc_out","bc_x%grcbc_vel_out"])
-        $:UPDATE(device=["bc_y%grcbc_in","bc_y%grcbc_out","bc_y%grcbc_vel_out"])
-        $:UPDATE(device=["bc_z%grcbc_in","bc_z%grcbc_out","bc_z%grcbc_vel_out"])
+        $:GPU_UPDATE(device=["bc_x%grcbc_in","bc_x%grcbc_out","bc_x%grcbc_vel_out"])
+        $:GPU_UPDATE(device=["bc_y%grcbc_in","bc_y%grcbc_out","bc_y%grcbc_vel_out"])
+        $:GPU_UPDATE(device=["bc_z%grcbc_in","bc_z%grcbc_out","bc_z%grcbc_vel_out"])
 
-        $:UPDATE(device=["relax", "relax_model"])
+        $:GPU_UPDATE(device=["relax", "relax_model"])
         if (relax) then
-            $:UPDATE(device=["palpha_eps", "ptgalpha_eps"])
+            $:GPU_UPDATE(device=["palpha_eps", "ptgalpha_eps"])
         end if
 
         if (ib) then
-            $:UPDATE(device=["ib_markers%sf"])
+            $:GPU_UPDATE(device=["ib_markers%sf"])
         end if
     end subroutine s_initialize_gpu_vars
 
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 6631933e2b..4433fc9926 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -30,16 +30,16 @@ module m_surface_tension
     !> @{
     type(scalar_field), allocatable, dimension(:) :: c_divs
     !> @)
-    $:DECLARE(create=["c_divs"])
+    $:GPU_DECLARE(create=["c_divs"])
 
     !> @name cell boundary reconstructed gradient components and magnitude
     !> @{
     real(wp), allocatable, dimension(:, :, :, :) :: gL_x, gR_x, gL_y, gR_y, gL_z, gR_z
     !> @}
-    $:DECLARE(create=["gL_x","gR_x","gL_y","gR_y","gL_z","gR_z"])
+    $:GPU_DECLARE(create=["gL_x","gR_x","gL_y","gR_y","gL_z","gR_z"])
 
     type(int_bounds_info) :: is1, is2, is3, iv
-    $:DECLARE(create=["is1","is2","is3","iv"])
+    $:GPU_DECLARE(create=["is1","is2","is3","iv"])
 
 contains
 
@@ -86,7 +86,7 @@ contains
         integer :: j, k, l, i
 
         if (id == 1) then
-            $:PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
+            $:GPU_PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
                 "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", &
                 "normWR", "normW"])
             do l = isz%beg, isz%end
@@ -133,7 +133,7 @@ contains
 
         elseif (id == 2) then
 
-            $:PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
+            $:GPU_PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
                 "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", "normWR", &
                 "normW"])
             do l = isz%beg, isz%end
@@ -180,7 +180,7 @@ contains
 
         elseif (id == 3) then
 
-            $:PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
+            $:GPU_PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
                 "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", "normWR", &
                 "normW"])
             do l = isz%beg, isz%end
@@ -244,7 +244,7 @@ contains
         isx%end = m; isy%end = n; isz%end = p
 
         ! compute gradient components
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -254,7 +254,7 @@ contains
             end do
         end do
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -265,7 +265,7 @@ contains
         end do
 
         if (p > 0) then
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -276,7 +276,7 @@ contains
             end do
         end if
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -336,10 +336,10 @@ contains
 
         end if
 
-        $:UPDATE(device=["is1","is2","is3","iv"])
+        $:GPU_UPDATE(device=["is1","is2","is3","iv"])
 
         if (recon_dir == 1) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -352,7 +352,7 @@ contains
             end do
             !$acc end parallel loop
         else if (recon_dir == 2) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
@@ -365,7 +365,7 @@ contains
             end do
             !$acc end parallel loop
         else if (recon_dir == 3) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index b9d2594a05..4d9e28e6a5 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -78,7 +78,7 @@ module m_time_steppers
     integer, private :: num_ts !<
     !! Number of time stages in the time-stepping scheme
 
-    $:DECLARE(create=["q_cons_ts","q_prim_vf","q_T_sf","rhs_vf","q_prim_ts","rhs_mv","rhs_pb","max_dt"])
+    $:GPU_DECLARE(create=["q_cons_ts","q_prim_vf","q_T_sf","rhs_vf","q_prim_ts","rhs_mv","rhs_pb","max_dt"])
 
 contains
 
@@ -380,7 +380,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -395,7 +395,7 @@ contains
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -412,7 +412,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -482,7 +482,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -497,7 +497,7 @@ contains
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -514,7 +514,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -554,7 +554,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -569,7 +569,7 @@ contains
         end do
 
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -587,7 +587,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -664,7 +664,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -679,7 +679,7 @@ contains
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -696,7 +696,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -736,7 +736,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -751,7 +751,7 @@ contains
         end do
 
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -769,7 +769,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -809,7 +809,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3)
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
@@ -824,7 +824,7 @@ contains
         end do
 
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -842,7 +842,7 @@ contains
         end if
 
         if (qbmm .and. (.not. polytropic)) then
-            $:PARALLEL_LOOP(collapse=5)
+            $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
                 do l = 0, p
                     do k = 0, n
@@ -949,7 +949,7 @@ contains
             if (stage == 3) then
                 if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
                 if (lag_params%write_bubbles) then
-                    $:UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
+                    $:GPU_UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
                     call s_write_lag_particles(mytime)
                 end if
                 call s_write_void_evol(mytime)
@@ -982,7 +982,7 @@ contains
             q_prim_vf, &
             idwint)
 
-        $:PARALLEL_LOOP(collapse=3, private=["vel", "alpha", "Re"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private=["vel", "alpha", "Re"])
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -1006,7 +1006,7 @@ contains
             call s_mpi_allreduce_min(dt_local, dt)
         end if
 
-        $:UPDATE(device=["dt"])
+        $:GPU_UPDATE(device=["dt"])
 
     end subroutine s_compute_dt
 
@@ -1025,7 +1025,7 @@ contains
         call nvtxStartRange("RHS-BODYFORCES")
         call s_compute_body_forces_rhs(q_prim_vf, q_cons_vf, rhs_vf)
 
-        $:PARALLEL_LOOP(collapse=4)
+        $:GPU_PARALLEL_LOOP(collapse=4)
         do i = momxb, E_idx
             do l = 0, p
                 do k = 0, n
@@ -1051,7 +1051,7 @@ contains
         integer :: i !< Generic loop iterator
 
         do i = 1, sys_size
-            $:UPDATE(host=["q_prim_vf(i)%sf"])
+            $:GPU_UPDATE(host=["q_prim_vf(i)%sf"])
         end do
 
         if (t_step == t_step_start) then
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 2cc8797e27..ba41303c22 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -25,10 +25,10 @@ module m_viscous
 
     type(int_bounds_info) :: iv
     type(int_bounds_info) :: is1_viscous, is2_viscous, is3_viscous
-    $:DECLARE(create=["is1_viscous","is2_viscous","is3_viscous","iv"])
+    $:GPU_DECLARE(create=["is1_viscous","is2_viscous","is3_viscous","iv"])
 
     real(wp), allocatable, dimension(:, :) :: Res_viscous
-    $:DECLARE(create=["Res_viscous"])
+    $:GPU_DECLARE(create=["Res_viscous"])
 
 contains
 
@@ -43,7 +43,7 @@ contains
                 Res_viscous(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
             end do
         end do
-        $:UPDATE(device=["Res_viscous","Re_idx","Re_size"])
+        $:GPU_UPDATE(device=["Res_viscous","Re_idx","Re_size"])
         !$acc enter data copyin(is1_viscous, is2_viscous, is3_viscous, iv)
 
     end subroutine s_initialize_viscous_module
@@ -76,13 +76,13 @@ contains
 
         is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
 
-        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
+        $:GPU_UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
             do k = is2_viscous%beg, is2_viscous%end
                 do j = is1_viscous%beg, is1_viscous%end
-                    $:LOOP()
+                    $:GPU_LOOP()
                     do i = momxb, E_idx
                         tau_Re_vf(i)%sf(j, k, l) = 0._wp
                     end do
@@ -90,13 +90,13 @@ contains
             end do
         end do
         if (shear_stress) then    ! Shear stresses
-            $:PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
+            $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
                 "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -112,14 +112,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -138,7 +138,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -149,7 +149,7 @@ contains
 
                             end if
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -157,12 +157,12 @@ contains
                             end do
 
                             if (viscous) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -182,7 +182,7 @@ contains
                                         - 2._wp*grad_x_vf(1)%sf(j, k, l) &
                                         - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
                                        (3._wp*Re_visc(1))
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, 2
                             tau_Re_vf(contxe + i)%sf(j, k, l) = &
                                 tau_Re_vf(contxe + i)%sf(j, k, l) - &
@@ -198,13 +198,13 @@ contains
         end if
 
         if (bulk_stress) then    ! Bulk stresses
-            $:PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
+            $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
                 "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -220,14 +220,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -246,7 +246,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -257,7 +257,7 @@ contains
 
                             end if
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -265,12 +265,12 @@ contains
                             end do
 
                             if (viscous) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -303,13 +303,13 @@ contains
         if (p == 0) return
 
         if (shear_stress) then    ! Shear stresses
-            $:PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
+            $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
                 "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -325,14 +325,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -351,7 +351,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -362,7 +362,7 @@ contains
 
                             end if
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -370,12 +370,12 @@ contains
                             end do
 
                             if (viscous) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -395,7 +395,7 @@ contains
                                         y_cc(k) + grad_y_vf(3)%sf(j, k, l))/ &
                                        Re_visc(1)
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 2, 3
                             tau_Re_vf(contxe + i)%sf(j, k, l) = &
                                 tau_Re_vf(contxe + i)%sf(j, k, l) - &
@@ -412,13 +412,13 @@ contains
         end if
 
         if (bulk_stress) then    ! Bulk stresses
-            $:PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
+            $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
                 "alpha_rho_visc", "Re_visc", "tau_Re"])
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -434,14 +434,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -460,7 +460,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -471,7 +471,7 @@ contains
 
                             end if
 
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -479,12 +479,12 @@ contains
                             end do
 
                             if (viscous) then
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    $:LOOP()
+                                    $:GPU_LOOP()
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -550,7 +550,7 @@ contains
 
             iv%beg = mom_idx%beg; iv%end = mom_idx%end
 
-            $:UPDATE(device=["iv"])
+            $:GPU_UPDATE(device=["iv"])
 
             call s_reconstruct_cell_boundary_values_visc( &
                 q_prim_qp%vf(iv%beg:iv%end), &
@@ -588,17 +588,17 @@ contains
         else ! Compute velocity gradient at cell centers using finite differences
 
             iv%beg = mom_idx%beg; iv%end = mom_idx%end
-            $:UPDATE(device=["iv"])
+            $:GPU_UPDATE(device=["iv"])
 
             is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
 
-            $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
+            $:GPU_UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = iy%beg, iy%end
                     do j = is1_viscous%beg + 1, is1_viscous%end
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = iv%beg, iv%end
                             dqL_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
                                 (q_prim_qp%vf(i)%sf(j, k, l) - &
@@ -609,11 +609,11 @@ contains
                 end do
             end do
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end - 1
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = iv%beg, iv%end
                             dqR_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
                                 (q_prim_qp%vf(i)%sf(j + 1, k, l) - &
@@ -626,11 +626,11 @@ contains
 
             if (n > 0) then
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg, is1_viscous%end
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = iv%beg, iv%end
                                 dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
                                     (q_prim_qp%vf(i)%sf(k, j, l) - &
@@ -641,11 +641,11 @@ contains
                     end do
                 end do
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg, is1_viscous%end
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = iv%beg, iv%end
                                 dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
                                     (q_prim_qp%vf(i)%sf(k, j + 1, l) - &
@@ -656,11 +656,11 @@ contains
                     end do
                 end do
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = iv%beg, iv%end
                                 dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
                                     (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
@@ -675,11 +675,11 @@ contains
                     end do
                 end do
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = iv%beg, iv%end
                                 dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
                                     (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
@@ -695,11 +695,11 @@ contains
                     end do
                 end do
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg + 1, is1_viscous%end
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = iv%beg, iv%end
                                 dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
                                     (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
@@ -715,11 +715,11 @@ contains
                     end do
                 end do
 
-                $:PARALLEL_LOOP(collapse=3)
+                $:GPU_PARALLEL_LOOP(collapse=3)
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg, is1_viscous%end - 1
-                            $:LOOP()
+                            $:GPU_LOOP()
                             do i = iv%beg, iv%end
                                 dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
                                     (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
@@ -737,11 +737,11 @@ contains
 
                 if (p > 0) then
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
@@ -753,11 +753,11 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
@@ -769,11 +769,11 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg + 1, is1_viscous%end
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
@@ -790,11 +790,11 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end - 1
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
@@ -811,11 +811,11 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg + 1, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
@@ -832,11 +832,11 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
@@ -853,11 +853,11 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
@@ -874,11 +874,11 @@ contains
                         end do
                     end do
 
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
@@ -894,11 +894,11 @@ contains
                             end do
                         end do
                     end do
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
@@ -914,11 +914,11 @@ contains
                             end do
                         end do
                     end do
-                    $:PARALLEL_LOOP(collapse=3)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = iv%beg, iv%end
                                     dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
                                         (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + &
@@ -1024,7 +1024,7 @@ contains
         if (viscous) then
             if (weno_Re_flux) then
                 if (norm_dir == 2) then
-                    $:PARALLEL_LOOP(collapse=4)
+                    $:GPU_PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end
@@ -1036,7 +1036,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 3) then
-                    $:PARALLEL_LOOP(collapse=4)
+                    $:GPU_PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do j = is1_viscous%beg, is1_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1048,7 +1048,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 1) then
-                    $:PARALLEL_LOOP(collapse=4)
+                    $:GPU_PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1123,7 +1123,7 @@ contains
         if (viscous) then
             if (weno_Re_flux) then
                 if (norm_dir == 2) then
-                    $:PARALLEL_LOOP(collapse=4)
+                    $:GPU_PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end
@@ -1135,7 +1135,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 3) then
-                    $:PARALLEL_LOOP(collapse=4)
+                    $:GPU_PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do j = is1_viscous%beg, is1_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1147,7 +1147,7 @@ contains
                         end do
                     end do
                 elseif (norm_dir == 1) then
-                    $:PARALLEL_LOOP(collapse=4)
+                    $:GPU_PARALLEL_LOOP(collapse=4)
                     do i = iv%beg, iv%end
                         do l = is3_viscous%beg, is3_viscous%end
                             do k = is2_viscous%beg, is2_viscous%end
@@ -1211,11 +1211,11 @@ contains
             ! cell-boundaries, to calculate the cell-averaged first-order
             ! spatial derivatives inside the cell.
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg + 1, is1_viscous%end - 1
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = iv%beg, iv%end
                             dv_ds_vf(i)%sf(j, k, l) = &
                                 1._wp/((1._wp + wa_flg)*dL(j)) &
@@ -1239,11 +1239,11 @@ contains
             ! cell-boundaries, to calculate the cell-averaged first-order
             ! spatial derivatives inside the cell.
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg + 1, is2_viscous%end - 1
                     do j = is1_viscous%beg, is1_viscous%end
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = iv%beg, iv%end
                             dv_ds_vf(i)%sf(j, k, l) = &
                                 1._wp/((1._wp + wa_flg)*dL(k)) &
@@ -1267,11 +1267,11 @@ contains
             ! cell-boundaries, to calculate the cell-averaged first-order
             ! spatial derivatives inside the cell.
 
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg + 1, is3_viscous%end - 1
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end
-                        $:LOOP()
+                        $:GPU_LOOP()
                         do i = iv%beg, iv%end
                             dv_ds_vf(i)%sf(j, k, l) = &
                                 1._wp/((1._wp + wa_flg)*dL(l)) &
@@ -1320,9 +1320,9 @@ contains
 
         is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
 
-        $:UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
+        $:GPU_UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
 
-        $:PARALLEL_LOOP(collapse=3)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
             do k = is2_viscous%beg, is2_viscous%end
                 do j = is1_viscous%beg, is1_viscous%end
@@ -1334,7 +1334,7 @@ contains
         end do
 
         if (n > 0) then
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end
@@ -1347,7 +1347,7 @@ contains
         end if
 
         if (p > 0) then
-            $:PARALLEL_LOOP(collapse=3)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end
@@ -1359,7 +1359,7 @@ contains
             end do
         end if
 
-        $:PARALLEL_LOOP(collapse=2)
+        $:GPU_PARALLEL_LOOP(collapse=2)
         do l = idwbuff(3)%beg, idwbuff(3)%end
             do k = idwbuff(2)%beg, idwbuff(2)%end
                 grad_x%sf(idwbuff(1)%beg, k, l) = &
@@ -1371,7 +1371,7 @@ contains
             end do
         end do
         if (n > 0) then
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do j = idwbuff(1)%beg, idwbuff(1)%end
                     grad_y%sf(j, idwbuff(2)%beg, l) = &
@@ -1383,7 +1383,7 @@ contains
                 end do
             end do
             if (p > 0) then
-                $:PARALLEL_LOOP(collapse=2)
+                $:GPU_PARALLEL_LOOP(collapse=2)
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         grad_z%sf(j, k, idwbuff(3)%beg) = &
@@ -1398,7 +1398,7 @@ contains
         end if
 
         if (bc_x%beg <= BC_GHOST_EXTRAPOLATION) then
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     grad_x%sf(0, k, l) = (-3._wp*var%sf(0, k, l) + 4._wp*var%sf(1, k, l) - var%sf(2, k, l))/ &
@@ -1407,7 +1407,7 @@ contains
             end do
         end if
         if (bc_x%end <= BC_GHOST_EXTRAPOLATION) then
-            $:PARALLEL_LOOP(collapse=2)
+            $:GPU_PARALLEL_LOOP(collapse=2)
             do l = idwbuff(3)%beg, idwbuff(3)%end
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     grad_x%sf(m, k, l) = (3._wp*var%sf(m, k, l) - 4._wp*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ &
@@ -1417,7 +1417,7 @@ contains
         end if
         if (n > 0) then
             if (bc_y%beg <= BC_GHOST_EXTRAPOLATION .and. bc_y%beg /= BC_NULL) then
-                $:PARALLEL_LOOP(collapse=2)
+                $:GPU_PARALLEL_LOOP(collapse=2)
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         grad_y%sf(j, 0, l) = (-3._wp*var%sf(j, 0, l) + 4._wp*var%sf(j, 1, l) - var%sf(j, 2, l))/ &
@@ -1426,7 +1426,7 @@ contains
                 end do
             end if
             if (bc_y%end <= BC_GHOST_EXTRAPOLATION) then
-                $:PARALLEL_LOOP(collapse=2)
+                $:GPU_PARALLEL_LOOP(collapse=2)
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         grad_y%sf(j, n, l) = (3._wp*var%sf(j, n, l) - 4._wp*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ &
@@ -1436,7 +1436,7 @@ contains
             end if
             if (p > 0) then
                 if (bc_z%beg <= BC_GHOST_EXTRAPOLATION) then
-                    $:PARALLEL_LOOP(collapse=2)
+                    $:GPU_PARALLEL_LOOP(collapse=2)
                     do k = idwbuff(2)%beg, idwbuff(2)%end
                         do j = idwbuff(1)%beg, idwbuff(1)%end
                             grad_z%sf(j, k, 0) = &
@@ -1446,7 +1446,7 @@ contains
                     end do
                 end if
                 if (bc_z%end <= BC_GHOST_EXTRAPOLATION) then
-                    $:PARALLEL_LOOP(collapse=2)
+                    $:GPU_PARALLEL_LOOP(collapse=2)
                     do k = idwbuff(2)%beg, idwbuff(2)%end
                         do j = idwbuff(1)%beg, idwbuff(1)%end
                             grad_z%sf(j, k, p) = &
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 17e0583687..098a2b1836 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -43,7 +43,7 @@ module m_weno
     !> @{
     real(wp), allocatable, dimension(:, :, :, :) :: v_rs_ws_x, v_rs_ws_y, v_rs_ws_z
     !> @}
-    $:DECLARE(create=["v_rs_ws_x","v_rs_ws_y","v_rs_ws_z"])
+    $:GPU_DECLARE(create=["v_rs_ws_x","v_rs_ws_y","v_rs_ws_z"])
 
     ! WENO Coefficients
 
@@ -60,8 +60,8 @@ module m_weno
     real(wp), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_y
     real(wp), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_z
     !> @}
-    $:DECLARE(create=["poly_coef_cbL_x","poly_coef_cbL_y","poly_coef_cbL_z"])
-    $:DECLARE(create=["poly_coef_cbR_x","poly_coef_cbR_y","poly_coef_cbR_z"])
+    $:GPU_DECLARE(create=["poly_coef_cbL_x","poly_coef_cbL_y","poly_coef_cbL_z"])
+    $:GPU_DECLARE(create=["poly_coef_cbR_x","poly_coef_cbR_y","poly_coef_cbR_z"])
 
     !> @name The ideal weights at the left and the right cell-boundaries and at the
     !! left and the right quadrature points, in x-, y- and z-directions. Note
@@ -76,7 +76,7 @@ module m_weno
     real(wp), target, allocatable, dimension(:, :) :: d_cbR_y
     real(wp), target, allocatable, dimension(:, :) :: d_cbR_z
     !> @}
-    $:DECLARE(create=["d_cbL_x","d_cbL_y","d_cbL_z","d_cbR_x","d_cbR_y","d_cbR_z"])
+    $:GPU_DECLARE(create=["d_cbL_x","d_cbL_y","d_cbL_z","d_cbR_x","d_cbR_y","d_cbR_z"])
 
     !> @name Smoothness indicator coefficients in the x-, y-, and z-directions. Note
     !! that the first array dimension identifies the smoothness indicator, the
@@ -87,22 +87,22 @@ module m_weno
     real(wp), target, allocatable, dimension(:, :, :) :: beta_coef_y
     real(wp), target, allocatable, dimension(:, :, :) :: beta_coef_z
     !> @}
-    $:DECLARE(create=["beta_coef_x","beta_coef_y","beta_coef_z"])
+    $:GPU_DECLARE(create=["beta_coef_x","beta_coef_y","beta_coef_z"])
 
     ! END: WENO Coefficients
 
     integer :: v_size !< Number of WENO-reconstructed cell-average variables
-    $:DECLARE(create=["v_size"])
+    $:GPU_DECLARE(create=["v_size"])
 
     !> @name Indical bounds in the s1-, s2- and s3-directions
     !> @{
     type(int_bounds_info) :: is1_weno, is2_weno, is3_weno
-    $:DECLARE(create=["is1_weno","is2_weno","is3_weno"])
+    $:GPU_DECLARE(create=["is1_weno","is2_weno","is3_weno"])
     !
     !> @}
 
     real(wp) :: test
-    $:DECLARE(create=["test"])
+    $:GPU_DECLARE(create=["test"])
 
 contains
 
@@ -625,11 +625,11 @@ contains
         #:endfor
 
         if (weno_dir == 1) then
-            $:UPDATE(device=["poly_coef_cbL_x","poly_coef_cbR_x","d_cbL_x","d_cbR_x","beta_coef_x"])
+            $:GPU_UPDATE(device=["poly_coef_cbL_x","poly_coef_cbR_x","d_cbL_x","d_cbR_x","beta_coef_x"])
         elseif (weno_dir == 2) then
-            $:UPDATE(device=["poly_coef_cbL_y","poly_coef_cbR_y","d_cbL_y","d_cbR_y","beta_coef_y"])
+            $:GPU_UPDATE(device=["poly_coef_cbL_y","poly_coef_cbR_y","d_cbL_y","d_cbR_y","beta_coef_y"])
         else
-            $:UPDATE(device=["poly_coef_cbL_z","poly_coef_cbR_z","d_cbL_z","d_cbR_z","beta_coef_z"])
+            $:GPU_UPDATE(device=["poly_coef_cbL_z","poly_coef_cbR_z","d_cbL_z","d_cbR_z","beta_coef_z"])
         end if
 
         ! Nullifying WENO coefficients and cell-boundary locations pointers
@@ -663,7 +663,7 @@ contains
         is2_weno = is2_weno_d
         is3_weno = is3_weno_d
 
-        $:UPDATE(device=["is1_weno","is2_weno","is3_weno"])
+        $:GPU_UPDATE(device=["is1_weno","is2_weno","is3_weno"])
 
         if (weno_order /= 1) then
             call s_initialize_weno(v_vf, &
@@ -672,7 +672,7 @@ contains
 
         if (weno_order == 1) then
             if (weno_dir == 1) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
@@ -685,7 +685,7 @@ contains
                 end do
                 !$acc end parallel loop
             else if (weno_dir == 2) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
@@ -698,7 +698,7 @@ contains
                 end do
                 !$acc end parallel loop
             else if (weno_dir == 3) then
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
@@ -714,7 +714,7 @@ contains
         elseif (weno_order == 3) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
-                    $:PARALLEL_LOOP(collapse=4,private=["beta","dvd","poly","omega","alpha","tau"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private=["beta","dvd","poly","omega","alpha","tau"])
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
@@ -793,11 +793,11 @@ contains
         elseif (weno_order == 5) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
-                    $:PARALLEL_LOOP(collapse=3,private=["dvd","poly","beta","alpha","omega","tau","delta"])
+                    $:GPU_PARALLEL_LOOP(collapse=3,private=["dvd","poly","beta","alpha","omega","tau","delta"])
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, v_size
                                     ! reconstruct from left side
 
@@ -913,11 +913,11 @@ contains
         elseif (weno_order == 7) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
-                    $:PARALLEL_LOOP(collapse=3,private=["poly","beta","alpha","omega","tau","delta","dvd","v"])
+                    $:GPU_PARALLEL_LOOP(collapse=3,private=["poly","beta","alpha","omega","tau","delta","dvd","v"])
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
-                                $:LOOP()
+                                $:GPU_LOOP()
                                 do i = 1, v_size
 
                                     if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3, k, l, i) ! temporary field value array for clarity
@@ -1133,10 +1133,10 @@ contains
         ! as to reshape the inputted data in the coordinate direction of
         ! the WENO reconstruction
         v_size = ubound(v_vf, 1)
-        $:UPDATE(device=["v_size"])
+        $:GPU_UPDATE(device=["v_size"])
 
         if (weno_dir == 1) then
-            $:PARALLEL_LOOP(collapse=4)
+            $:GPU_PARALLEL_LOOP(collapse=4)
             do j = 1, v_size
                 do q = is3_weno%beg, is3_weno%end
                     do l = is2_weno%beg, is2_weno%end
@@ -1174,7 +1174,7 @@ contains
                 end if
             else
 #endif
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do j = 1, v_size
                     do q = is3_weno%beg, is3_weno%end
                         do l = is2_weno%beg, is2_weno%end
@@ -1204,7 +1204,7 @@ contains
                 end block
             else
 #endif
-                $:PARALLEL_LOOP(collapse=4)
+                $:GPU_PARALLEL_LOOP(collapse=4)
                 do j = 1, v_size
                     do q = is3_weno%beg, is3_weno%end
                         do l = is2_weno%beg, is2_weno%end
@@ -1267,7 +1267,7 @@ contains
         real(wp), parameter :: alpha_mp = 2._wp
         real(wp), parameter :: beta_mp = 4._wp/3._wp
 
-        $:PARALLEL_LOOP(collapse=4,private=["d"])
+        $:GPU_PARALLEL_LOOP(collapse=4,private=["d"])
         do l = is3_weno%beg, is3_weno%end
             do k = is2_weno%beg, is2_weno%end
                 do j = is1_weno%beg, is1_weno%end

From 444c4d6fd37f5a633fe87752476d83ee0a595d77 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 19:34:14 -0400
Subject: [PATCH 42/75] Replace enter data directives and add cray inline
 support

---
 src/common/include/directive_macros.fpp | 17 +++++++++++++--
 src/common/m_variables_conversion.fpp   |  2 +-
 src/simulation/m_fftw.fpp               |  2 +-
 src/simulation/m_global_parameters.fpp  | 15 ++++++++-----
 src/simulation/m_ibm.fpp                |  4 ++--
 src/simulation/m_qbmm.fpp               |  2 +-
 src/simulation/m_rhs.fpp                | 28 ++++++++++++-------------
 src/simulation/m_riemann_solvers.fpp    |  2 +-
 src/simulation/m_viscous.fpp            |  2 +-
 9 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/src/common/include/directive_macros.fpp b/src/common/include/directive_macros.fpp
index 0ca82b305c..de4f20cafe 100644
--- a/src/common/include/directive_macros.fpp
+++ b/src/common/include/directive_macros.fpp
@@ -215,7 +215,8 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_ROUTINE(parallelism=['seq'], nohost=False, extraAccArgs=None)
+#:def GPU_ROUTINE(function_name=None, parallelism=['seq'], nohost=False, cray_inline=False, extraAccArgs=None)
+    #:assert isinstance(cray_inline, bool)
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
 
     #:assert isinstance(nohost, bool)
@@ -230,7 +231,19 @@
     #:set clause_val = parallelism_val.strip('\n') + nohost_val.strip('\n')
     #:set acc_directive = '!$acc routine ' + &
         & clause_val + extraAccArgs_val.strip('\n')
-    $:acc_directive
+    #:if cray_inline == True
+        #:if not isinstance(function_name, str)
+            #:stop "When inlining for Cray Compiler, function name must be given and given as a string"
+        #:endif
+#ifdef _CRAYFTN
+        #:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n')
+        $:cray_directive
+#else
+        $:acc_directive
+#endif
+    #:else
+        $:acc_directive
+    #:endif
 #:enddef
 
 #:def GPU_DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index f7d5e9075a..eb18cdd417 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -611,7 +611,7 @@ contains
 
         integer :: i, j
 
-!$acc enter data copyin(is1b, is1e, is2b, is2e, is3b, is3e)
+$:GPU_ENTER_DATA(copyin=["is1b","is1e","is2b","is2e","is3b","is3e"])
 
 #ifdef MFC_SIMULATION
         @:ALLOCATE(gammas (1:num_fluids))
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index b0614f880a..e26674edd2 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -90,7 +90,7 @@ contains
         gpu_fft_size(1) = real_size; 
         iembed(1) = 0
         oembed(1) = 0
-        !$acc enter data copyin(real_size, cmplx_size, x_size, sys_size, batch_size, Nfq)
+        $:GPU_ENTER_DATA(copyin=["real_size","cmplx_size","x_size","sys_size","batch_size","Nfq"])
         $:GPU_UPDATE(device=["real_size","cmplx_size","x_size","sys_size","batch_size"])
 #else
         ! Allocate input and output DFT data sizes
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index a91b2d49b6..43674e61e0 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -1257,11 +1257,16 @@ contains
             $:GPU_UPDATE(device=["mhd", "relativity"])
         #:endif
 
-        !$acc enter data copyin(nb, R0ref, Ca, Web, Re_inv, weight, R0, V0, bubbles_euler, polytropic, polydisperse, qbmm, R0_type, ptil, bubble_model, thermal, poly_sigma)
-        !$acc enter data copyin(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN, mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam)
-        !$acc enter data copyin(dir_idx, dir_flg, dir_idx_tau)
-
-        !$acc enter data copyin(relax, relax_model, palpha_eps,ptgalpha_eps)
+        $:GPU_ENTER_DATA(copyin=["nb","R0ref","Ca","Web","Re_inv","weight","R0", &
+            & "V0","bubbles_euler","polytropic","polydisperse","qbmm","R0_type", &
+            & "ptil","bubble_model","thermal","poly_sigma"])
+        $:GPU_ENTER_DATA(copyin=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw","pv", &
+            & "M_n","M_v","k_n","k_v","pb0","mass_n0","mass_v0","Pe_T", &
+            & "Re_trans_T","Re_trans_c","Im_trans_T","Im_trans_c","omegaN", &
+            & "mul0","ss","gamma_v","mu_v","gamma_m","gamma_n","mu_n","gam"])
+        $:GPU_ENTER_DATA(copyin=["dir_idx","dir_flg","dir_idx_tau"])
+
+        $:GPU_ENTER_DATA(copyin=["relax","relax_model","palpha_eps","ptgalpha_eps"])
 
         ! Allocating grid variables for the x-, y- and z-directions
         @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index bc9a9e401e..b1d47a65e3 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -71,7 +71,7 @@ contains
         @:ACC_SETUP_SFs(levelset)
         @:ACC_SETUP_SFs(levelset_norm)
 
-        !$acc enter data copyin(num_gps, num_inner_gps)
+        $:GPU_ENTER_DATA(copyin=["num_gps","num_inner_gps"])
 
     end subroutine s_initialize_ibm_module
 
@@ -96,7 +96,7 @@ contains
         @:ALLOCATE(ghost_points(1:num_gps))
         @:ALLOCATE(inner_points(1:num_inner_gps))
 
-        !$acc enter data copyin(ghost_points, inner_points)
+        $:GPU_ENTER_DATA(copyin=["ghost_points","inner_points"])
 
         call s_find_ghost_points(ghost_points, inner_points)
         $:GPU_UPDATE(device=["ghost_points", "inner_points"])
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 86ef374e1c..08476068b2 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -58,7 +58,7 @@ contains
                 nterms = 7
             end if
 
-            !$acc enter data copyin(nterms)
+            $:GPU_ENTER_DATA(copyin=["nterms"])
             $:GPU_UPDATE(device=["nterms"])
 
         #:endif
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index d87ac93811..dfba8ef638 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -174,7 +174,7 @@ contains
 
         integer :: num_eqns_after_adv
 
-        !$acc enter data copyin(idwbuff, idwbuff)
+        $:GPU_ENTER_DATA(copyin=["idwbuff","idwbuff"])
         $:GPU_UPDATE(device=["idwbuff", "idwbuff"])
 
         @:ALLOCATE(q_cons_qp%vf(1:sys_size))
@@ -202,29 +202,29 @@ contains
                 @:ALLOCATE(q_prim_qp%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
             else
                 q_prim_qp%vf(l)%sf => q_cons_qp%vf(l)%sf
-                !$acc enter data copyin(q_prim_qp%vf(l)%sf)
-                !$acc enter data attach(q_prim_qp%vf(l)%sf)
+                $:GPU_ENTER_DATA(copyin=["q_prim_qp%vf(l)%sf"])
+                $:GPU_ENTER_DATA(attach=["q_prim_qp%vf(l)%sf"])
             end if
         end do
 
         do l = adv_idx%beg, adv_idx%end
             q_prim_qp%vf(l)%sf => q_cons_qp%vf(l)%sf
-            !$acc enter data copyin(q_prim_qp%vf(l)%sf)
-            !$acc enter data attach(q_prim_qp%vf(l)%sf)
+            $:GPU_ENTER_DATA(copyin=["q_prim_qp%vf(l)%sf"])
+            $:GPU_ENTER_DATA(attach=["q_prim_qp%vf(l)%sf"])
         end do
 
         if (surface_tension) then
             q_prim_qp%vf(c_idx)%sf => &
                 q_cons_qp%vf(c_idx)%sf
-            !$acc enter data copyin(q_prim_qp%vf(c_idx)%sf)
-            !$acc enter data attach(q_prim_qp%vf(c_idx)%sf)
+            $:GPU_ENTER_DATA(copyin=["q_prim_qp%vf(c_idx)%sf"])
+            $:GPU_ENTER_DATA(attach=["q_prim_qp%vf(c_idx)%sf"])
         end if
 
         if (cont_damage) then
             q_prim_qp%vf(damage_idx)%sf => &
                 q_cons_qp%vf(damage_idx)%sf
-            !$acc enter data copyin(q_prim_qp%vf(damage_idx)%sf)
-            !$acc enter data attach(q_prim_qp%vf(damage_idx)%sf)
+            $:GPU_ENTER_DATA(copyin=["q_prim_qp%vf(damage_idx)%sf"])
+            $:GPU_ENTER_DATA(attach=["q_prim_qp%vf(damage_idx)%sf"])
         end if
 
         if (viscous) then
@@ -549,14 +549,14 @@ contains
                 if (riemann_solver /= 1 .and. riemann_solver /= 4) then
                     do l = adv_idx%beg + 1, adv_idx%end
                         flux_src_n(i)%vf(l)%sf => flux_src_n(i)%vf(adv_idx%beg)%sf
-                        !$acc enter data attach(flux_src_n(i)%vf(l)%sf)
+                        $:GPU_ENTER_DATA(attach=["flux_src_n(i)%vf(l)%sf"])
                     end do
                 end if
             else
                 do l = 1, sys_size
                     flux_n(i)%vf(l)%sf => flux_n(1)%vf(l)%sf
                     flux_src_n(i)%vf(l)%sf => flux_src_n(1)%vf(l)%sf
-                    !$acc enter data attach(flux_n(i)%vf(l)%sf,flux_src_n(i)%vf(l)%sf)
+                    $:GPU_ENTER_DATA(attach=["flux_n(i)%vf(l)%sf","flux_src_n(i)%vf(l)%sf"])
                 end do
             end if
         end do
@@ -1788,13 +1788,13 @@ contains
                 @:DEALLOCATE(q_cons_qp%vf(j)%sf)
                 @:DEALLOCATE(q_prim_qp%vf(j)%sf)
             else
-                !$acc exit data detach(q_prim_qp%vf(j)%sf)
+                $:GPU_EXIT_DATA(detach=["q_prim_qp%vf(j)%sf"])
                 nullify (q_prim_qp%vf(j)%sf)
             end if
         end do
 
         do j = adv_idx%beg, adv_idx%end
-            !$acc exit data detach(q_prim_qp%vf(j)%sf)
+            $:GPU_EXIT_DATA(detach=["q_prim_qp%vf(j)%sf"])
             nullify (q_prim_qp%vf(j)%sf)
         end do
 
@@ -1827,7 +1827,7 @@ contains
         end if
 
         if (mpp_lim .and. bubbles_euler) then
-            !$acc exit data delete(alf_sum%sf)
+            $:GPU_EXIT_DATA(delete=["alf_sum%sf"])
             deallocate (alf_sum%sf)
         end if
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index d81659c2b4..1140834845 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -3338,7 +3338,7 @@ contains
             $:GPU_UPDATE(device=["Res","Re_idx","Re_size"])
         end if
 
-        !$acc enter data copyin(is1, is2, is3, isx, isy, isz)
+        $:GPU_ENTER_DATA(copyin=["is1","is2","is3","isx","isy","isz"])
 
         is1%beg = -1; is2%beg = 0; is3%beg = 0
         is1%end = m; is2%end = n; is3%end = p
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index ba41303c22..d2c6fedbdd 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -44,7 +44,7 @@ contains
             end do
         end do
         $:GPU_UPDATE(device=["Res_viscous","Re_idx","Re_size"])
-        !$acc enter data copyin(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:GPU_ENTER_DATA(copyin=["is1_viscous","is2_viscous","is3_viscous","iv"])
 
     end subroutine s_initialize_viscous_module
 

From 593ed0f2597c582f10a383d67a384ccf533cd464 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 19:45:01 -0400
Subject: [PATCH 43/75] Renamed gpu macros file to parallel_macros.fpp

---
 .../include/{directive_macros.fpp => parallel_macros.fpp}       | 0
 src/common/m_boundary_common.fpp                                | 2 +-
 src/common/m_chemistry.fpp                                      | 2 +-
 src/common/m_finite_differences.fpp                             | 2 +-
 src/common/m_helper.fpp                                         | 2 +-
 src/common/m_helper_basic.fpp                                   | 2 +-
 src/common/m_mpi_common.fpp                                     | 2 +-
 src/common/m_phase_change.fpp                                   | 2 +-
 src/common/m_variables_conversion.fpp                           | 2 +-
 src/pre_process/m_assign_variables.fpp                          | 2 +-
 src/pre_process/m_patches.fpp                                   | 2 +-
 src/simulation/m_acoustic_src.fpp                               | 2 +-
 src/simulation/m_body_forces.fpp                                | 2 +-
 src/simulation/m_boundary_conditions.fpp                        | 2 +-
 src/simulation/m_bubbles.fpp                                    | 2 +-
 src/simulation/m_bubbles_EE.fpp                                 | 2 +-
 src/simulation/m_bubbles_EL.fpp                                 | 2 +-
 src/simulation/m_bubbles_EL_kernels.fpp                         | 2 +-
 src/simulation/m_cbc.fpp                                        | 2 +-
 src/simulation/m_compute_cbc.fpp                                | 2 +-
 src/simulation/m_data_output.fpp                                | 2 +-
 src/simulation/m_fftw.fpp                                       | 2 +-
 src/simulation/m_global_parameters.fpp                          | 2 +-
 src/simulation/m_hyperelastic.fpp                               | 2 +-
 src/simulation/m_hypoelastic.fpp                                | 2 +-
 src/simulation/m_ibm.fpp                                        | 2 +-
 src/simulation/m_mhd.fpp                                        | 2 +-
 src/simulation/m_mpi_proxy.fpp                                  | 2 +-
 src/simulation/m_qbmm.fpp                                       | 2 +-
 src/simulation/m_rhs.fpp                                        | 2 +-
 src/simulation/m_riemann_solvers.fpp                            | 2 +-
 src/simulation/m_sim_helpers.fpp                                | 2 +-
 src/simulation/m_start_up.fpp                                   | 2 +-
 src/simulation/m_surface_tension.fpp                            | 2 +-
 src/simulation/m_time_steppers.fpp                              | 2 +-
 src/simulation/m_viscous.fpp                                    | 2 +-
 src/simulation/m_weno.fpp                                       | 2 +-
 37 files changed, 36 insertions(+), 36 deletions(-)
 rename src/common/include/{directive_macros.fpp => parallel_macros.fpp} (100%)

diff --git a/src/common/include/directive_macros.fpp b/src/common/include/parallel_macros.fpp
similarity index 100%
rename from src/common/include/directive_macros.fpp
rename to src/common/include/parallel_macros.fpp
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 5f9f8fa16e..ad0f2d81ff 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -6,7 +6,7 @@
 !! boundary condiitons
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 module m_boundary_common
 
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index 4984a2248f..a094fe4651 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -4,7 +4,7 @@
 !! @author Henry Le Berre <hberre3@gatech.edu>
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 #:include 'case.fpp'
 
 module m_chemistry
diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp
index 8119c1257a..7b7ca73029 100644
--- a/src/common/m_finite_differences.fpp
+++ b/src/common/m_finite_differences.fpp
@@ -1,5 +1,5 @@
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 module m_finite_differences
 
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index cbcabc0a3f..83ac14675d 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -1,5 +1,5 @@
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 !>
 !! @file m_helper.f90
 !! @brief Contains module m_helper
diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp
index b6afc36416..9b2d4007af 100644
--- a/src/common/m_helper_basic.fpp
+++ b/src/common/m_helper_basic.fpp
@@ -1,4 +1,4 @@
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 !>
 !! @file m_helper_basic.f90
 !! @brief Contains module m_helper_basic
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 6ea2bc60f2..b7a5c2368c 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -1,6 +1,6 @@
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The module serves as a proxy to the parameters and subroutines
 !!          available in the MPI implementation's MPI module. Specifically,
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 15f11be298..baf4a7601c 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -2,7 +2,7 @@
 !> procedure.
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 module m_phase_change
 
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index eb18cdd417..8db15cc5c9 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_variables_conversion
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 #:include 'case.fpp'
 
 !> @brief This module consists of subroutines used in the conversion of the
diff --git a/src/pre_process/m_assign_variables.fpp b/src/pre_process/m_assign_variables.fpp
index 0f77df808d..fe1a089fbb 100644
--- a/src/pre_process/m_assign_variables.fpp
+++ b/src/pre_process/m_assign_variables.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_assign_variables
 
 #:include 'case.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 module m_assign_variables
 
diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp
index 2dfefe6623..a9950a2ab9 100644
--- a/src/pre_process/m_patches.fpp
+++ b/src/pre_process/m_patches.fpp
@@ -7,7 +7,7 @@
 #:include '2dHardcodedIC.fpp'
 #:include '3dHardcodedIC.fpp'
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 module m_patches
 
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 3e1dea3e5e..3b6d1f6ad2 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_acoustic_src
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The module contains the subroutines used to create a acoustic source pressure source term
 module m_acoustic_src
diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index d761141a0d..176c5e6fe9 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -1,5 +1,5 @@
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 module m_body_forces
 
diff --git a/src/simulation/m_boundary_conditions.fpp b/src/simulation/m_boundary_conditions.fpp
index f7408c43de..825fff5daf 100644
--- a/src/simulation/m_boundary_conditions.fpp
+++ b/src/simulation/m_boundary_conditions.fpp
@@ -2,7 +2,7 @@
 ! @brief Contains module m_boundary_conditions
 
 !> @brief This module contains
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 module m_boundary_conditions
 
     use m_derived_types
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index 3d9341ef0a..c25db8b892 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_bubbles
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief This module contains the procedures shared by the ensemble-averaged and volume-averaged bubble models.
 module m_bubbles
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index c522af5e27..fc1bb62580 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_bubbles_EE
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to compute the ensemble-averaged bubble dynamic variables
 module m_bubbles_EE
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 3240a9d9a8..4ab4a904da 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_bubbles_EL
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to to compute the volume-averaged bubble model
 module m_bubbles_EL
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index aa660b2ed0..f6f9c70d4f 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_bubbles_EL_kernels
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief This module contains kernel functions used to map the effect of the lagrangian bubbles
 !!        in the Eulerian framework.
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 3c587d9735..27a918f7cd 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -19,7 +19,7 @@
 !!              Please refer to Thompson (1987, 1990) for detailed descriptions.
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 module m_cbc
 
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index d397d19cf5..51e404777e 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -1,4 +1,4 @@
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 !>
 !! @file m_compute_cbc.f90
 !! @brief CBC computation module
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index b1922c08fa..34965ea09e 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_data_output
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The primary purpose of this module is to output the grid and the
 !!              conservative variables data at the chosen time-step interval. In
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index e26674edd2..24b8bb6188 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_fftw
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The module contains the subroutines for the FFT routines
 module m_fftw
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 43674e61e0..576d18ad15 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -4,7 +4,7 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The module contains all of the parameters describing the program
 !!              logistics, the computational domain and the simulation algorithm.
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index dd232a7cd8..a00a7c808e 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_hyperelastic
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief This module consists of subroutines used in the calculation
 !!              of the cauchy tensor
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index 73f6383f69..e4105dac8c 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_hypoelastic
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to compute source terms for hypoelastic model
 module m_hypoelastic
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index b1d47a65e3..10a9a25349 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_ibm
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to handle all operations related to immersed
 !!              boundary methods (IBMs)
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index 88f8621dbd..669363305d 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_mhd
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to compute source terms for magnetohydrodynamics
 !!              Note: not applicable for 1D
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index c8e943e8d9..d87c5df02c 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -4,7 +4,7 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The module serves as a proxy to the parameters and subroutines
 !!          available in the MPI implementation's MPI module. Specifically,
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 08476068b2..3521a71c00 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_qbmm
 
 #:include 'case.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 #:include 'macros.fpp'
 
 !> @brief This module is used to compute moment inversion via qbmm
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index dfba8ef638..7f73510a31 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -4,7 +4,7 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The module contains the subroutines used to calculate the right-
 !!              hane-side (RHS) in the quasi-conservative, shock- and interface-
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 1140834845..706bb37730 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -21,7 +21,7 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 #:include 'inline_riemann.fpp'
 
 module m_riemann_solvers
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 2ff31ea283..da0aa370cb 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -1,4 +1,4 @@
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 module m_sim_helpers
 
     use m_derived_types        !< Definitions of the derived types
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 981c9f8020..a8134173ca 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_start_up
 
 #:include 'case.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The purpose of the module is primarily to read in the files that
 !!              contain the inputs, the initial condition data and the grid data
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 4433fc9926..a29065c6da 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -1,5 +1,5 @@
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 #:include 'inline_capillary.fpp'
 
 !> @brief This module is used to compute source terms for surface tension model
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 4d9e28e6a5..344b27d2c7 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_time_steppers
 
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The following module features a variety of time-stepping schemes.
 !!              Currently, it includes the following Runge-Kutta (RK) algorithms:
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index d2c6fedbdd..77f939fb8b 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -2,7 +2,7 @@
 !! @file m_viscous.f90
 !! @brief Contains module m_viscous
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief The module contains the subroutines used to compute viscous terms.
 module m_viscous
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 098a2b1836..7ff75df03d 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -2,7 +2,7 @@
 !! @file m_weno.f90
 !! @brief Contains module m_weno
 #:include 'macros.fpp'
-#:include 'directive_macros.fpp'
+#:include 'parallel_macros.fpp'
 
 !> @brief  Weighted essentially non-oscillatory (WENO) reconstruction scheme
 !!              that is supplemented with monotonicity preserving bounds (MPWENO)

From 7d378bee2a0fe272fb8c0709bc915c392b9e4654 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 20:10:04 -0400
Subject: [PATCH 44/75] Ran formatter

---
 src/common/m_variables_conversion.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 8db15cc5c9..93b1c3e435 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -611,7 +611,7 @@ contains
 
         integer :: i, j
 
-$:GPU_ENTER_DATA(copyin=["is1b","is1e","is2b","is2e","is3b","is3e"])
+        $:GPU_ENTER_DATA(copyin=["is1b","is1e","is2b","is2e","is3b","is3e"])
 
 #ifdef MFC_SIMULATION
         @:ALLOCATE(gammas (1:num_fluids))

From a8950143d892e54d779b025f5235375d5f93d937 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 20:29:34 -0400
Subject: [PATCH 45/75] Undo changes mpi_common as will be refactored in future
 pull request

---
 src/common/m_mpi_common.fpp | 69 ++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 35 deletions(-)

diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index b7a5c2368c..609382d8a2 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -1,6 +1,5 @@
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief The module serves as a proxy to the parameters and subroutines
 !!          available in the MPI implementation's MPI module. Specifically,
@@ -26,7 +25,7 @@ module m_mpi_common
     implicit none
 
     integer, private :: err_code, ierr, v_size !<
-    $:GPU_DECLARE(create=["v_size"])
+    !$acc declare create(v_size)
     !! Generic flags used to identify and report MPI errors
 
     real(wp), private, allocatable, dimension(:), target :: buff_send !<
@@ -39,10 +38,10 @@ module m_mpi_common
     !! average primitive variables, for a single computational domain boundary
     !! at the time, from the relevant neighboring processor.
 
-    $:GPU_DECLARE(create=["buff_send","buff_recv"])
+    !$acc declare create(buff_send, buff_recv)
 
     integer :: halo_size, nVars
-    $:GPU_DECLARE(create=["halo_size","nVars"])
+    !$acc declare create(halo_size, nVars)
 
 contains
 
@@ -639,7 +638,7 @@ contains
 #ifdef MFC_MPI
 
         call nvtxStartRange("RHS-COMM-PACKBUF")
-        $:GPU_UPDATE(device=["v_size"])
+!$acc update device(v_size)
 
 #ifdef MFC_SIMULATION
         if (qbmm .and. .not. polytropic) then
@@ -693,7 +692,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do l = 0, p
                         do k = 0, n
                             do j = 0, buff_size - 1
@@ -707,7 +706,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                        !$acc parallel loop collapse(4) gang vector default(present) private(r)
                         do l = 0, p
                             do k = 0, n
                                 do j = 0, buff_size - 1
@@ -722,7 +721,7 @@ contains
                             end do
                         end do
 
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do l = 0, p
                             do k = 0, n
                                 do j = 0, buff_size - 1
@@ -739,7 +738,7 @@ contains
                     end if
 #endif
                 #:elif mpi_dir == 2
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do i = 1, sys_size
                         do l = 0, p
                             do k = 0, buff_size - 1
@@ -755,7 +754,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = 0, buff_size - 1
@@ -771,7 +770,7 @@ contains
                             end do
                         end do
 
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = 0, buff_size - 1
@@ -789,7 +788,7 @@ contains
                     end if
 #endif
                 #:else
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do i = 1, sys_size
                         do l = 0, buff_size - 1
                             do k = -buff_size, n + buff_size
@@ -805,7 +804,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, buff_size - 1
                                 do k = -buff_size, n + buff_size
@@ -821,7 +820,7 @@ contains
                             end do
                         end do
 
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, buff_size - 1
                                 do k = -buff_size, n + buff_size
@@ -856,7 +855,7 @@ contains
                     call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
                 #:else
                     call nvtxStartRange("RHS-COMM-DEV2HOST")
-                    $:GPU_UPDATE(host=["buff_send"])
+                    !$acc update host(buff_send)
                     call nvtxEndRange
                     call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
                 #:endif
@@ -874,7 +873,7 @@ contains
                     !$acc wait
                 #:else
                     call nvtxStartRange("RHS-COMM-HOST2DEV")
-                    $:GPU_UPDATE(device=["buff_recv"])
+                    !$acc update device(buff_recv)
                     call nvtxEndRange
                 #:endif
             end if
@@ -891,7 +890,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do l = 0, p
                         do k = 0, n
                             do j = -buff_size, -1
@@ -912,7 +911,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do l = 0, p
                             do k = 0, n
                                 do j = -buff_size, -1
@@ -927,7 +926,7 @@ contains
                             end do
                         end do
 
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do l = 0, p
                             do k = 0, n
                                 do j = -buff_size, -1
@@ -944,7 +943,7 @@ contains
                     end if
 #endif
                 #:elif mpi_dir == 2
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do i = 1, sys_size
                         do l = 0, p
                             do k = -buff_size, -1
@@ -966,7 +965,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = -buff_size, -1
@@ -982,7 +981,7 @@ contains
                             end do
                         end do
 
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do i = sys_size + 1, sys_size + 4
                             do l = 0, p
                                 do k = -buff_size, -1
@@ -1001,7 +1000,7 @@ contains
 #endif
                 #:else
                     ! Unpacking buffer from bc_z%beg
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do i = 1, sys_size
                         do l = -buff_size, -1
                             do k = -buff_size, n + buff_size
@@ -1024,7 +1023,7 @@ contains
 
 #ifdef MFC_SIMULATION
                     if (qbmm .and. .not. polytropic) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do i = sys_size + 1, sys_size + 4
                             do l = -buff_size, -1
                                 do k = -buff_size, n + buff_size
@@ -1041,7 +1040,7 @@ contains
                             end do
                         end do
 
-                        $:GPU_PARALLEL_LOOP(collapse=5,private=["r"])
+                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
                         do i = sys_size + 1, sys_size + 4
                             do l = -buff_size, -1
                                 do k = -buff_size, n + buff_size
@@ -1089,7 +1088,7 @@ contains
 #ifdef MFC_MPI
 
         nVars = num_dims + 1
-        $:GPU_UPDATE(device=["nVars"])
+        !$acc update device(nVars)
 
         buffer_counts = (/ &
                         buff_size*nVars*(n + 1)*(p + 1), &
@@ -1131,7 +1130,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do l = 0, p
                         do k = 0, n
                             do j = 0, buff_size - 1
@@ -1144,7 +1143,7 @@ contains
                     end do
 
                 #:elif mpi_dir == 2
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do i = 1, nVars
                         do l = 0, p
                             do k = 0, buff_size - 1
@@ -1159,7 +1158,7 @@ contains
                     end do
 
                 #:else
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do i = 1, nVars
                         do l = 0, buff_size - 1
                             do k = -buff_size, n + buff_size
@@ -1187,7 +1186,7 @@ contains
                     call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
                 #:else
                     call nvtxStartRange("RHS-COMM-DEV2HOST")
-                    $:GPU_UPDATE(host=["buff_send"])
+                    !$acc update host(buff_send)
                     call nvtxEndRange
                     call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
                 #:endif
@@ -1205,7 +1204,7 @@ contains
                     !$acc wait
                 #:else
                     call nvtxStartRange("RHS-COMM-HOST2DEV")
-                    $:GPU_UPDATE(device=["buff_recv"])
+                    !$acc update device(buff_recv)
                     call nvtxEndRange
                 #:endif
             end if
@@ -1221,7 +1220,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do l = 0, p
                         do k = 0, n
                             do j = -buff_size, -1
@@ -1241,7 +1240,7 @@ contains
                     end do
 
                 #:elif mpi_dir == 2
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do i = 1, nVars
                         do l = 0, p
                             do k = -buff_size, -1
@@ -1263,7 +1262,7 @@ contains
 
                 #:else
                     ! Unpacking buffer from bc_z%beg
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["r"])
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
                     do i = 1, nVars
                         do l = -buff_size, -1
                             do k = -buff_size, n + buff_size
@@ -1301,4 +1300,4 @@ contains
 
     end subroutine s_finalize_mpi_common_module
 
-end module m_mpi_common
+end module m_mpi_common
\ No newline at end of file

From f36163f915be9e20ef5f19cd8f2f596630595bd5 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 12 Jun 2025 20:32:24 -0400
Subject: [PATCH 46/75] Ran formatter

---
 src/common/m_mpi_common.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 609382d8a2..b439ce1c6d 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -1300,4 +1300,4 @@ contains
 
     end subroutine s_finalize_mpi_common_module
 
-end module m_mpi_common
\ No newline at end of file
+end module m_mpi_common

From 4c9f230e49247be1f8e16c3461a40c8750b25534 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Mon, 16 Jun 2025 11:32:18 -0400
Subject: [PATCH 47/75] Moved parallel_macros include directly into macros.fpp

---
 src/common/include/macros.fpp            | 2 ++
 src/common/m_boundary_common.fpp         | 1 -
 src/common/m_chemistry.fpp               | 1 -
 src/common/m_finite_differences.fpp      | 1 -
 src/common/m_helper.fpp                  | 2 +-
 src/common/m_helper_basic.fpp            | 3 ++-
 src/common/m_phase_change.fpp            | 1 -
 src/common/m_variables_conversion.fpp    | 1 -
 src/pre_process/m_assign_variables.fpp   | 2 +-
 src/pre_process/m_patches.fpp            | 1 -
 src/simulation/m_acoustic_src.fpp        | 1 -
 src/simulation/m_body_forces.fpp         | 1 -
 src/simulation/m_boundary_conditions.fpp | 3 ++-
 src/simulation/m_bubbles.fpp             | 1 -
 src/simulation/m_bubbles_EE.fpp          | 1 -
 src/simulation/m_bubbles_EL.fpp          | 1 -
 src/simulation/m_bubbles_EL_kernels.fpp  | 1 -
 src/simulation/m_cbc.fpp                 | 1 -
 src/simulation/m_compute_cbc.fpp         | 3 ++-
 src/simulation/m_data_output.fpp         | 1 -
 src/simulation/m_fftw.fpp                | 1 -
 src/simulation/m_global_parameters.fpp   | 1 -
 src/simulation/m_hyperelastic.fpp        | 1 -
 src/simulation/m_hypoelastic.fpp         | 1 -
 src/simulation/m_ibm.fpp                 | 1 -
 src/simulation/m_mhd.fpp                 | 1 -
 src/simulation/m_mpi_proxy.fpp           | 1 -
 src/simulation/m_qbmm.fpp                | 1 -
 src/simulation/m_rhs.fpp                 | 1 -
 src/simulation/m_riemann_solvers.fpp     | 1 -
 src/simulation/m_sim_helpers.fpp         | 3 ++-
 src/simulation/m_start_up.fpp            | 2 +-
 src/simulation/m_surface_tension.fpp     | 1 -
 src/simulation/m_time_steppers.fpp       | 1 -
 src/simulation/m_viscous.fpp             | 1 -
 src/simulation/m_weno.fpp                | 1 -
 36 files changed, 13 insertions(+), 35 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 679c276bf4..3a2bbf0b17 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -1,3 +1,5 @@
+#:include 'parallel_macros.fpp'
+
 #:def LOG(expr)
 #ifdef MFC_DEBUG
     block
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index ad0f2d81ff..ce5b98a3fb 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -6,7 +6,6 @@
 !! boundary condiitons
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 module m_boundary_common
 
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index a094fe4651..3ff223844f 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -4,7 +4,6 @@
 !! @author Henry Le Berre <hberre3@gatech.edu>
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 #:include 'case.fpp'
 
 module m_chemistry
diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp
index 7b7ca73029..c9bf9a4ad2 100644
--- a/src/common/m_finite_differences.fpp
+++ b/src/common/m_finite_differences.fpp
@@ -1,5 +1,4 @@
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 module m_finite_differences
 
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index 83ac14675d..44299f56d6 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -1,5 +1,5 @@
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
+
 !>
 !! @file m_helper.f90
 !! @brief Contains module m_helper
diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp
index 9b2d4007af..2ba77f0f15 100644
--- a/src/common/m_helper_basic.fpp
+++ b/src/common/m_helper_basic.fpp
@@ -1,8 +1,9 @@
-#:include 'parallel_macros.fpp'
 !>
 !! @file m_helper_basic.f90
 !! @brief Contains module m_helper_basic
 
+#:include 'macros.fpp'
+
 module m_helper_basic
 
     use m_derived_types        !< Definitions of the derived types
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index baf4a7601c..0a23205d58 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -2,7 +2,6 @@
 !> procedure.
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 module m_phase_change
 
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 93b1c3e435..4bc1426153 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_variables_conversion
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 #:include 'case.fpp'
 
 !> @brief This module consists of subroutines used in the conversion of the
diff --git a/src/pre_process/m_assign_variables.fpp b/src/pre_process/m_assign_variables.fpp
index fe1a089fbb..0774d43c88 100644
--- a/src/pre_process/m_assign_variables.fpp
+++ b/src/pre_process/m_assign_variables.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_assign_variables
 
 #:include 'case.fpp'
-#:include 'parallel_macros.fpp'
+#:include 'macros.fpp'
 
 module m_assign_variables
 
diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp
index a9950a2ab9..f5245d392a 100644
--- a/src/pre_process/m_patches.fpp
+++ b/src/pre_process/m_patches.fpp
@@ -7,7 +7,6 @@
 #:include '2dHardcodedIC.fpp'
 #:include '3dHardcodedIC.fpp'
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 module m_patches
 
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 3b6d1f6ad2..2305aff227 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_acoustic_src
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief The module contains the subroutines used to create a acoustic source pressure source term
 module m_acoustic_src
diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index 176c5e6fe9..794e7be075 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -1,5 +1,4 @@
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 module m_body_forces
 
diff --git a/src/simulation/m_boundary_conditions.fpp b/src/simulation/m_boundary_conditions.fpp
index 825fff5daf..5589ee23aa 100644
--- a/src/simulation/m_boundary_conditions.fpp
+++ b/src/simulation/m_boundary_conditions.fpp
@@ -2,7 +2,8 @@
 ! @brief Contains module m_boundary_conditions
 
 !> @brief This module contains
-#:include 'parallel_macros.fpp'
+#:include 'macros.fpp'
+
 module m_boundary_conditions
 
     use m_derived_types
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index 1699cf8251..7cf2107522 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_bubbles
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief This module contains the procedures shared by the ensemble-averaged and volume-averaged bubble models.
 module m_bubbles
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index fc1bb62580..201a17d6ab 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_bubbles_EE
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to compute the ensemble-averaged bubble dynamic variables
 module m_bubbles_EE
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 4ab4a904da..0b9933ef8b 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_bubbles_EL
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to to compute the volume-averaged bubble model
 module m_bubbles_EL
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index f6f9c70d4f..d51f33589e 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_bubbles_EL_kernels
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief This module contains kernel functions used to map the effect of the lagrangian bubbles
 !!        in the Eulerian framework.
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 27a918f7cd..07b859e085 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -19,7 +19,6 @@
 !!              Please refer to Thompson (1987, 1990) for detailed descriptions.
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 module m_cbc
 
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index 51e404777e..fbe4b0d6b3 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -1,8 +1,9 @@
-#:include 'parallel_macros.fpp'
 !>
 !! @file m_compute_cbc.f90
 !! @brief CBC computation module
 
+#:include 'macros.fpp'
+
 module m_compute_cbc
     use m_global_parameters
     implicit none
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 34965ea09e..2cec4cbe38 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_data_output
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief The primary purpose of this module is to output the grid and the
 !!              conservative variables data at the chosen time-step interval. In
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 24b8bb6188..4646f574a1 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_fftw
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief The module contains the subroutines for the FFT routines
 module m_fftw
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 576d18ad15..b5a174e0e0 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -4,7 +4,6 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief The module contains all of the parameters describing the program
 !!              logistics, the computational domain and the simulation algorithm.
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index a00a7c808e..1e8b475284 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_hyperelastic
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief This module consists of subroutines used in the calculation
 !!              of the cauchy tensor
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index e4105dac8c..b2b5d7a859 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_hypoelastic
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to compute source terms for hypoelastic model
 module m_hypoelastic
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 10a9a25349..a435d17454 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_ibm
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to handle all operations related to immersed
 !!              boundary methods (IBMs)
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index 669363305d..4f76567423 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_mhd
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief This module is used to compute source terms for magnetohydrodynamics
 !!              Note: not applicable for 1D
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index d87c5df02c..4ec4c463f5 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -4,7 +4,6 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief The module serves as a proxy to the parameters and subroutines
 !!          available in the MPI implementation's MPI module. Specifically,
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 3521a71c00..c4352d05b3 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_qbmm
 
 #:include 'case.fpp'
-#:include 'parallel_macros.fpp'
 #:include 'macros.fpp'
 
 !> @brief This module is used to compute moment inversion via qbmm
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 7f73510a31..416f9ca5bc 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -4,7 +4,6 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief The module contains the subroutines used to calculate the right-
 !!              hane-side (RHS) in the quasi-conservative, shock- and interface-
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 706bb37730..45685f4c13 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -21,7 +21,6 @@
 
 #:include 'case.fpp'
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 #:include 'inline_riemann.fpp'
 
 module m_riemann_solvers
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index da0aa370cb..aeb4f3933d 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -1,4 +1,5 @@
-#:include 'parallel_macros.fpp'
+#:include 'macros.fpp'
+
 module m_sim_helpers
 
     use m_derived_types        !< Definitions of the derived types
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index a8134173ca..a89d882b82 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -3,7 +3,7 @@
 !! @brief Contains module m_start_up
 
 #:include 'case.fpp'
-#:include 'parallel_macros.fpp'
+#:include 'macros.fpp'
 
 !> @brief The purpose of the module is primarily to read in the files that
 !!              contain the inputs, the initial condition data and the grid data
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index a29065c6da..f6181237ee 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -1,5 +1,4 @@
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 #:include 'inline_capillary.fpp'
 
 !> @brief This module is used to compute source terms for surface tension model
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 344b27d2c7..658f5abce8 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_time_steppers
 
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief The following module features a variety of time-stepping schemes.
 !!              Currently, it includes the following Runge-Kutta (RK) algorithms:
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 77f939fb8b..da65b0d0a0 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -2,7 +2,6 @@
 !! @file m_viscous.f90
 !! @brief Contains module m_viscous
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief The module contains the subroutines used to compute viscous terms.
 module m_viscous
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 7ff75df03d..d965efa377 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -2,7 +2,6 @@
 !! @file m_weno.f90
 !! @brief Contains module m_weno
 #:include 'macros.fpp'
-#:include 'parallel_macros.fpp'
 
 !> @brief  Weighted essentially non-oscillatory (WENO) reconstruction scheme
 !!              that is supplemented with monotonicity preserving bounds (MPWENO)

From 395bc3ed7ce8647fb645e7e61135ad33044d4234 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Mon, 16 Jun 2025 11:37:39 -0400
Subject: [PATCH 48/75] Changed default of routine parallelism argument

---
 src/common/include/parallel_macros.fpp  |  2 +-
 src/common/m_boundary_common.fpp        | 22 ++++++++--------
 src/common/m_helper.fpp                 |  4 +--
 src/common/m_helper_basic.fpp           |  6 ++---
 src/common/m_phase_change.fpp           | 12 ++++-----
 src/common/m_variables_conversion.fpp   | 10 ++++----
 src/pre_process/m_assign_variables.fpp  |  4 +--
 src/pre_process/m_patches.fpp           |  8 +++---
 src/simulation/m_acoustic_src.fpp       |  6 ++---
 src/simulation/m_bubbles.fpp            | 34 ++++++++++++-------------
 src/simulation/m_bubbles_EL.fpp         |  4 +--
 src/simulation/m_bubbles_EL_kernels.fpp | 12 ++++-----
 src/simulation/m_compute_cbc.fpp        | 16 ++++++------
 src/simulation/m_hyperelastic.fpp       |  4 +--
 src/simulation/m_ibm.fpp                |  2 +-
 src/simulation/m_qbmm.fpp               | 14 +++++-----
 src/simulation/m_riemann_solvers.fpp    |  4 +--
 src/simulation/m_sim_helpers.fpp        |  6 ++---
 18 files changed, 85 insertions(+), 85 deletions(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index de4f20cafe..08f46135a1 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -272,7 +272,7 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_LOOP(collapse=None, parallelism=["seq"], data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
+#:def GPU_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
 
     #:set collapse_val = GEN_COLLAPSE_STR(collapse)
 
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index ce5b98a3fb..b521d1272f 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -241,7 +241,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_ghost_cell_extrapolation
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -310,7 +310,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_symmetry
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -574,7 +574,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_periodic
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -717,7 +717,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_axis
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -780,7 +780,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_slip_wall
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -879,7 +879,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_no_slip_wall
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -1014,7 +1014,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_dirichlet
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
@@ -1083,7 +1083,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_qbmm_extrapolation
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
         integer, intent(in) :: bc_dir, bc_loc
@@ -1282,7 +1282,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_periodic
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
@@ -1340,7 +1340,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_reflective
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
@@ -1422,7 +1422,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_ghost_cell_extrapolation
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index 44299f56d6..509d9a3708 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -44,7 +44,7 @@ contains
         !! @param Rtmp is the  bubble radii
         !! @param ntmp is the output number bubble density
     pure subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: vftmp
         real(wp), dimension(nb), intent(in) :: Rtmp
         real(wp), intent(out) :: ntmp
@@ -58,7 +58,7 @@ contains
     end subroutine s_comp_n_from_prim
 
     pure subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: vftmp
         real(wp), dimension(nb), intent(in) :: nRtmp
         real(wp), intent(out) :: ntmp
diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp
index 2ba77f0f15..34c96c5113 100644
--- a/src/common/m_helper_basic.fpp
+++ b/src/common/m_helper_basic.fpp
@@ -25,7 +25,7 @@ contains
     !! @param tol_input Relative error (default = 1e-6_wp).
     !! @return Result of the comparison.
     logical pure elemental function f_approx_equal(a, b, tol_input) result(res)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: a, b
         real(wp), optional, intent(in) :: tol_input
         real(wp) :: tol
@@ -48,7 +48,7 @@ contains
     !> Checks if a real(wp) variable is of default value.
     !! @param var Variable to check.
     logical pure elemental function f_is_default(var) result(res)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: var
 
         res = f_approx_equal(var, dflt_real)
@@ -73,7 +73,7 @@ contains
     !> Checks if a real(wp) variable is an integer.
     !! @param var Variable to check.
     logical pure elemental function f_is_integer(var) result(res)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: var
 
         res = f_approx_equal(var, real(nint(var), wp))
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 0a23205d58..f4b1815687 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -289,7 +289,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_infinite_pt_relaxation_k
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         ! initializing variables
@@ -393,7 +393,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_infinite_ptg_relaxation_k
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         integer, intent(in) :: j, k, l
@@ -518,7 +518,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_correct_partial_densities
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         !> @name variables for the correction of the reacting partial densities
@@ -581,7 +581,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_jacobian_matrix
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         real(wp), dimension(2, 2), intent(out) :: InvJac
@@ -688,7 +688,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_pTg_residue
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         integer, intent(in) :: j, k, l
@@ -739,7 +739,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_TSat
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         real(wp), intent(in) :: pSat
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 4bc1426153..08ce822a83 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -120,7 +120,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_pressure
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         real(wp), intent(in) :: energy, alf
@@ -461,7 +461,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_acc
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K
@@ -542,7 +542,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_bubbles_acc
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         real(wp), intent(inout) :: rho_K, gamma_K, pi_inf_K, qv_K
@@ -1603,7 +1603,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_speed_of_sound
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         real(wp), intent(in) :: pres
@@ -1673,7 +1673,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_fast_magnetosonic_speed
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         real(wp), intent(in) :: B(3), rho, c
diff --git a/src/pre_process/m_assign_variables.fpp b/src/pre_process/m_assign_variables.fpp
index 0774d43c88..fa012d0695 100644
--- a/src/pre_process/m_assign_variables.fpp
+++ b/src/pre_process/m_assign_variables.fpp
@@ -104,7 +104,7 @@ contains
         !! @param patch_id_fp Array to track patch ids
     pure subroutine s_assign_patch_mixture_primitive_variables(patch_id, j, k, l, &
                                                                eta, q_prim_vf, patch_id_fp)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 
         integer, intent(in) :: patch_id
         integer, intent(in) :: j, k, l
@@ -277,7 +277,7 @@ contains
         !! @param patch_id_fp Array to track patch ids
     impure subroutine s_assign_patch_species_primitive_variables(patch_id, j, k, l, &
                                                                  eta, q_prim_vf, patch_id_fp)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 
         integer, intent(in) :: patch_id
         integer, intent(in) :: j, k, l
diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp
index f5245d392a..edab185958 100644
--- a/src/pre_process/m_patches.fpp
+++ b/src/pre_process/m_patches.fpp
@@ -2381,7 +2381,7 @@ contains
     end subroutine s_model
 
     subroutine s_convert_cylindrical_to_cartesian_coord(cyl_y, cyl_z)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 
         real(wp), intent(in) :: cyl_y, cyl_z
 
@@ -2392,7 +2392,7 @@ contains
 
     pure function f_convert_cyl_to_cart(cyl) result(cart)
 
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 
         t_vec3, intent(in) :: cyl
         t_vec3 :: cart
@@ -2404,7 +2404,7 @@ contains
     end function f_convert_cyl_to_cart
 
     subroutine s_convert_cylindrical_to_spherical_coord(cyl_x, cyl_y)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 
         real(wp), intent(IN) :: cyl_x, cyl_y
 
@@ -2417,7 +2417,7 @@ contains
     !! @param offset Thickness
     !! @param a Starting position
     pure elemental function f_r(myth, offset, a)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: myth, offset, a
         real(wp) :: b
         real(wp) :: f_r
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 2305aff227..e319c0e963 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -346,7 +346,7 @@ contains
     !! @param gauss_sigma_time_local sigma in time for Gaussian pulse
     !! @param source Source term amplitude
     pure elemental subroutine s_source_temporal(sim_time, c, ai, term_index, frequency_local, gauss_sigma_time_local, source, sum_BB)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         integer, intent(in) :: ai, term_index
         real(wp), intent(in) :: sim_time, c, sum_BB
         real(wp), intent(in) :: frequency_local, gauss_sigma_time_local
@@ -698,7 +698,7 @@ contains
     !! @param c Speed of sound
     !! @return frequency_local Converted frequency
     pure elemental function f_frequency_local(freq_conv_flag, ai, c)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         logical, intent(in) :: freq_conv_flag
         integer, intent(in) :: ai
         real(wp), intent(in) :: c
@@ -717,7 +717,7 @@ contains
     !! @param ai Acoustic source index
     !! @return gauss_sigma_time_local Converted Gaussian sigma time
     pure elemental function f_gauss_sigma_time_local(gauss_conv_flag, ai, c)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         logical, intent(in) :: gauss_conv_flag
         integer, intent(in) :: ai
         real(wp), intent(in) :: c
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index 7cf2107522..02e1932c8e 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -39,7 +39,7 @@ contains
         !!  @param f_divu Divergence of velocity
         !!  @param fCson Speed of sound from fP (EL)
     pure elemental function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu
         real(wp), intent(in) :: fCson
@@ -80,7 +80,7 @@ contains
         !!  @param fV Current bubble velocity
         !!  @param fpb Internal bubble pressure
     pure elemental function f_cpbw(fR0, fR, fV, fpb)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fR0, fR, fV, fpb
 
         real(wp) :: f_cpbw
@@ -99,7 +99,7 @@ contains
         !!  @param fntait Tait EOS parameter
         !!  @param fBtait Tait EOS parameter
     pure elemental function f_H(fCpbw, fCpinf, fntait, fBtait)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fCpbw, fCpinf, fntait, fBtait
 
         real(wp) :: tmp1, tmp2, tmp3
@@ -119,7 +119,7 @@ contains
         !! @param fBtait Tait EOS parameter
         !! @param fH Bubble enthalpy
     pure elemental function f_cgas(fCpinf, fntait, fBtait, fH)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fCpinf, fntait, fBtait, fH
 
         real(wp) :: tmp
@@ -142,7 +142,7 @@ contains
         !!  @param advsrc Advection equation source term
         !!  @param divu Divergence of velocity
     pure elemental function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fRho, fP, falf, fntait, fBtait, advsrc, divu
 
         real(wp) :: c2_liquid
@@ -172,7 +172,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fpbdot Time derivative of the internal bubble pressure
     pure elemental function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fCpbw, fCpinf, fCpinf_dot, fntait, fBtait
         real(wp), intent(in) :: fR, fV, fR0, fpbdot
 
@@ -208,7 +208,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fCpbw Boundary wall pressure
     pure elemental function f_rddot_RP(fCp, fRho, fR, fV, fCpbw)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fCp, fRho, fR, fV, fCpbw
 
         real(wp) :: f_rddot_RP
@@ -231,7 +231,7 @@ contains
         !!  @param fntait Tait EOS parameter
         !!  @param fBtait Tait EOS parameter
     pure elemental function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fCpbw, fR, fV, fH, fHdot
         real(wp), intent(in) :: fcgas, fntait, fBtait
 
@@ -254,7 +254,7 @@ contains
         !!  @param fV Current bubble velocity
         !!  @param fpb Internal bubble pressure
     pure elemental function f_cpbw_KM(fR0, fR, fV, fpb)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fR0, fR, fV, fpb
         real(wp) :: f_cpbw_KM
 
@@ -281,7 +281,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fC Current sound speed
     pure elemental function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fpbdot, fCp, fCpbw
         real(wp), intent(in) :: fRho, fR, fV, fR0, fC
 
@@ -315,7 +315,7 @@ contains
         !!  @param pb Internal bubble pressure
         !!  @param iR0 Current bubble size index
     pure elemental subroutine s_bwproperty(pb, iR0, chi_vw, k_mw, rho_mw)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: pb
         integer, intent(in) :: iR0
         real(wp), intent(out) :: chi_vw  !< Bubble wall properties (Ando 2010)
@@ -345,7 +345,7 @@ contains
         !!  @param fR_m Mixture gas constant (EL)
         !!  @param fgamma_m Mixture gamma (EL)
     pure elemental subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_n, fbeta_c, fR_m, fgamma_m)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fR
         real(wp), intent(in) :: fV
         real(wp), intent(in) :: fpb
@@ -403,7 +403,7 @@ contains
         !!  @param fR_m Mixture gas constant (EL)
         !!  @param fgamma_m Mixture gamma (EL)
     pure elemental function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in) :: fvflux
         real(wp), intent(in) :: fR
         real(wp), intent(in) :: fV
@@ -466,7 +466,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_advance_step
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), intent(inout) :: fR, fV, fpb, fmass_v
         real(wp), intent(in) :: fRho, fP, fR0, fpbdot, alf
@@ -600,7 +600,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_initial_substep_h
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu
@@ -686,7 +686,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_advance_substep
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), intent(OUT) :: err
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
@@ -784,7 +784,7 @@ contains
         !!  @param fdMvdt_tmp Rate of change of the mass of vapor in the bubble
     pure elemental subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, &
                                            fmass_n, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(IN) :: fR_tmp, fV_tmp, fPb_tmp, fMv_tmp
         real(wp), intent(IN) :: fmass_n, fbeta_c, fbeta_t
         integer, intent(IN) :: bub_id
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 0b9933ef8b..71706d18f8 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -741,7 +741,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_cson_from_pinf
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         real(wp), intent(in) :: pinf, rhol, gamma, pi_inf
@@ -811,7 +811,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_pinf
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         integer, intent(in) :: bub_id, ptype
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index d51f33589e..fe644244cc 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -203,7 +203,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_applygaussian
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(3), intent(in) :: center
         integer, dimension(3), intent(in) :: cellaux
@@ -273,7 +273,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_check_celloutside
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         integer, dimension(3), intent(inout) :: cellaux
         logical, intent(out) :: celloutside
@@ -309,7 +309,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_shift_cell_symmetric_bc
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         integer, dimension(3), intent(inout) :: cellaux
         integer, dimension(3), intent(in) :: cell
@@ -350,7 +350,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_stddsv
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         integer, dimension(3), intent(in) :: cell
         real(wp), intent(in) :: volpart
@@ -391,7 +391,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_char_vol
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         integer, intent(in) :: cellx, celly, cellz
         real(wp), intent(out) :: Charvol
@@ -416,7 +416,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_cell
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(3), intent(in) :: s_cell
         integer, dimension(3), intent(out) :: get_cell
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index fbe4b0d6b3..5bdbc6715d 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -88,7 +88,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_slip_wall_L
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -106,7 +106,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_buffer_L
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -136,7 +136,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_inflow_L
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -153,7 +153,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_outflow_L
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -177,7 +177,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_force_free_subsonic_outflow_L
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -199,7 +199,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_constant_pressure_subsonic_outflow_L
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -221,7 +221,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_inflow_L
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(sys_size), intent(inout) :: L
         L(1:advxe) = 0._wp
@@ -233,7 +233,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_outflow_L
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 1e8b475284..ad888c813d 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -220,7 +220,7 @@ contains
         !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor
         !! btensor is symmetric, save the data space
     pure subroutine s_neoHookean_cauchy_solver(btensor, q_prim_vf, G, j, k, l)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(scalar_field), dimension(b_size), intent(inout) :: btensor
         real(wp), intent(in) :: G
@@ -259,7 +259,7 @@ contains
         !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor
         !! btensor is symmetric, save the data space
     pure subroutine s_Mooney_Rivlin_cauchy_solver(btensor, q_prim_vf, G, j, k, l)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(scalar_field), dimension(b_size), intent(inout) :: btensor
         real(wp), intent(in) :: G
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index a435d17454..7adad082ca 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -750,7 +750,7 @@ contains
     !> Function that uses the interpolation coefficients and the current state
     !! at the cell centers in order to estimate the state at the image point
     pure subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb, mv, presb_IP, massv_IP)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         type(scalar_field), &
             dimension(sys_size), &
             intent(IN) :: q_prim_vf !< Primitive Variables
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index c4352d05b3..40e56ebf9d 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -566,7 +566,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_coeff_nonpoly
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
         real(wp), intent(in) :: pres, rho, c
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
@@ -639,7 +639,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_coeff
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         real(wp), intent(in) :: pres, rho, c
@@ -864,7 +864,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_chyqmom
 #else
-            $:GPU_ROUTINE()
+            $:GPU_ROUTINE(parallelism=['seq'])
 #endif
             real(wp), intent(in) :: pres, rho, c
             real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
@@ -880,7 +880,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_chyqmom
 #else
-            $:GPU_ROUTINE()
+            $:GPU_ROUTINE(parallelism=['seq'])
 #endif
             real(wp), dimension(nmom), intent(in) :: momin
             real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY
@@ -940,7 +940,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_hyqmom
 #else
-            $:GPU_ROUTINE()
+            $:GPU_ROUTINE(parallelism=['seq'])
 #endif
             real(wp), dimension(2), intent(inout) :: frho, fup
             real(wp), dimension(3), intent(in) :: fmom
@@ -959,7 +959,7 @@ contains
         end subroutine s_hyqmom
 
         pure function f_quad(abscX, abscY, wght_in, q, r, s)
-            $:GPU_ROUTINE()
+            $:GPU_ROUTINE(parallelism=['seq'])
             real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
             real(wp), intent(in) :: q, r, s
 
@@ -975,7 +975,7 @@ contains
         end function f_quad
 
         pure function f_quad2D(abscX, abscY, wght_in, pow)
-            $:GPU_ROUTINE()
+            $:GPU_ROUTINE(parallelism=['seq'])
             real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
             real(wp), dimension(3), intent(in) :: pow
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 45685f4c13..a55b0c144c 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -4241,7 +4241,7 @@ contains
     !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
     !! @param[out] tau_shear_out Calculated shear stress tensor (stress on i-face, j-direction).
     pure subroutine s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, tau_shear_out)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 
         implicit none
 
@@ -4275,7 +4275,7 @@ contains
     !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
     !! @param[out] tau_bulk_out Calculated bulk stress tensor (stress on i-face, i-direction).
     pure subroutine s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, tau_bulk_out)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 
         implicit none
 
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index aeb4f3933d..9be0c31c2b 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -35,7 +35,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_enthalpy
 #else
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
 #endif
 
         type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
@@ -100,7 +100,7 @@ contains
         !! @param vcfl_sf (optional) cell centered viscous cfl number
         !! @param Rc_sf (optional) cell centered Rc
     pure subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), intent(in), dimension(num_vels) :: vel
         real(wp), intent(in) :: c, rho
         real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: icfl_sf
@@ -197,7 +197,7 @@ contains
         !! @param k y coordinate
         !! @param l z coordinate
     pure subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l)
-        $:GPU_ROUTINE()
+        $:GPU_ROUTINE(parallelism=['seq'])
         real(wp), dimension(num_vels), intent(in) :: vel
         real(wp), intent(in) :: c, rho
         real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: max_dt

From c9b8708b0fb9630aa0aa34da1ce667b55b6317dd Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Tue, 17 Jun 2025 14:23:03 -0400
Subject: [PATCH 49/75] Switch to string lists

---
 src/common/include/parallel_macros.fpp   |  77 ++++++----
 src/common/m_boundary_common.fpp         |   2 +-
 src/common/m_chemistry.fpp               |   2 +-
 src/common/m_finite_differences.fpp      |   2 +-
 src/common/m_phase_change.fpp            |  16 +-
 src/common/m_variables_conversion.fpp    |  26 ++--
 src/simulation/m_acoustic_src.fpp        |  46 +++---
 src/simulation/m_body_forces.fpp         |   4 +-
 src/simulation/m_boundary_conditions.fpp |   8 +-
 src/simulation/m_bubbles.fpp             |   2 +-
 src/simulation/m_bubbles_EE.fpp          |  16 +-
 src/simulation/m_bubbles_EL.fpp          |  76 ++++-----
 src/simulation/m_bubbles_EL_kernels.fpp  |   4 +-
 src/simulation/m_cbc.fpp                 |  58 +++----
 src/simulation/m_data_output.fpp         |  14 +-
 src/simulation/m_fftw.fpp                |  16 +-
 src/simulation/m_global_parameters.fpp   | 188 +++++++++++------------
 src/simulation/m_hyperelastic.fpp        |  18 +--
 src/simulation/m_hypoelastic.fpp         |  16 +-
 src/simulation/m_ibm.fpp                 |  42 ++---
 src/simulation/m_mhd.fpp                 |  12 +-
 src/simulation/m_mpi_proxy.fpp           |  74 ++++-----
 src/simulation/m_qbmm.fpp                |  30 ++--
 src/simulation/m_rhs.fpp                 | 164 ++++++++++----------
 src/simulation/m_riemann_solvers.fpp     | 124 +++++++--------
 src/simulation/m_sim_helpers.fpp         |   6 +-
 src/simulation/m_start_up.fpp            |  64 ++++----
 src/simulation/m_surface_tension.fpp     |  26 ++--
 src/simulation/m_time_steppers.fpp       |  10 +-
 src/simulation/m_viscous.fpp             |  34 ++--
 src/simulation/m_weno.fpp                |  34 ++--
 31 files changed, 614 insertions(+), 597 deletions(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index 08f46135a1..d34b32065f 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -1,3 +1,4 @@
+
 #:def ASSERT_LIST(data, datatype)
     #:assert data is not None
     #:assert isinstance(data, list)
@@ -5,9 +6,28 @@
     #:assert all(isinstance(element, datatype) for element in data)
 #:enddef
 
-#:def GEN_PARENTHESES_CLAUSE(clause_name, clause_list)
+#:def GEN_PARENTHESES_CLAUSE(clause_name, clause_str)
+    #:set clause_regex = re.compile(',(?![^(]*\\))')
+
     #:assert isinstance(clause_name, str)
-    #:if clause_list is not None
+    #:if clause_str is not None
+        #:set count = 0
+        #:assert isinstance(clause_str, str)
+        #:assert clause_str[0] == '[' and clause_str[-1] == ']'
+        #:for c in clause_str
+            #:if c == '('
+                #:set count = count + 1
+            #:elif c == ')'
+                #:set count = count - 1
+            #:endif
+            #:if count > 1
+                #:stop 'Nested parentheses is not supported. Incorrect clause: {}'.format(clause_str)
+            #:elif count < 0
+                #:stop 'Missing parentheses. Incorrect clause: {}'.format(clause_str)
+            #:endif
+        #:endfor
+        #:set clause_str = re.sub(clause_regex, ';', clause_str)
+        #:set clause_list = [x.strip() for x in clause_str.strip('[]').split(';')]
         $:ASSERT_LIST(clause_list, str)
         #:set clause_str = clause_name + '(' + ', '.join(clause_list) + ') '
     #:else
@@ -33,15 +53,10 @@
 
 #:def GEN_COPYIN_STR(copyin, readonly)
     #:assert isinstance(readonly, bool)
-    #:if copyin is not None
-        $:ASSERT_LIST(copyin, str)
-        #:if readonly == True
-            #:set copyin_val = 'copyin(readonly:' + ', '.join(copyin) + ') '
-        #:else
-            #:set copyin_val = 'copyin(' + ', '.join(copyin) + ') '
-        #:endif
-    #:else
-        #:set copyin_val = ''
+    #:set copyin_val = GEN_PARENTHESES_CLAUSE('copyin', copyin)
+    #:if copyin is not None and readonly == True
+        #:set index = copyin_val.find('copyin(') + len('copyin(')
+        #:set copyin_val = copyin_val[:index] + 'readonly:' + copyin_val[index:]
     #:endif
     $:copyin_val
 #:enddef
@@ -137,21 +152,26 @@
 
 #:def GEN_REDUCTION_STR(reduction, reductionOp)
     #:if reduction is not None and reductionOp is not None
-        #:if isinstance(reduction, list) and isinstance(reductionOp, list)
-            $:ASSERT_LIST(reduction, list)
-            $:ASSERT_LIST(reductionOp, str)
-            #:assert all(len(element) != 0 for element in reduction)
-            #:assert all(isinstance(element, str) for sublist in reduction for element in sublist)
-            #:assert len(reduction) == len(reductionOp)
-            #:set reduction_list = ['reduction(' + op + ':' + ', '.join(red) + ') ' for (red, op) in zip(reduction, reductionOp)]
-            #:set reduction_val = ' '. join(reduction_list) + ' '
-        #:elif isinstance(reduction, list) and isinstance(reductionOp, str)
-            $:ASSERT_LIST(reduction, str)
-            #:assert isinstance(reductionOp, str)
-            #:set reduction_val = 'reduction(' + reductionOp + ':' + ', '.join(reduction) + ') '
-        #:else
-            #:stop 'Invalid datatypes for reduction or reductionOp. Must be list of lists and lists or list and str respectively'
-        #:endif
+        #:assert isinstance(reduction, str)
+        #:assert isinstance(reductionOp, str)
+        #:assert reduction[0] == '[' and reduction[-1] == ']'
+        #:assert reductionOp[0] == '[' and reductionOp[-1] == ']'
+        #:set reduction = reduction.replace(' ', '')
+        #:set reduction = reduction[1:-1]
+        #:set reduction_list = reduction.split('],')
+        #:set reduction_list = [str + ']' for str in reduction_list]
+        #:assert all(str[0] == '[' and str[-1] == ']' for str in reduction_list)
+
+        #:set reductionOp_list = [x.strip() for x in reductionOp.strip('[]').split(',')]
+        $:ASSERT_LIST(reduction_list, str)
+        $:ASSERT_LIST(reductionOp_list, str)
+        #:assert len(reduction_list) == len(reductionOp_list)
+        #:set reduction_val = ''
+        #:for i in range(len(reduction_list))
+            #:set temp_clause = GEN_PARENTHESES_CLAUSE('reduction', reduction_list[i]).strip('\n')
+            #:set ind = temp_clause.find('reduction(') + len('reduction(')
+            #:set reduction_val = reduction_val.strip('\n') + temp_clause[:ind] + reductionOp_list[i] + ':' + temp_clause[ind:]
+        #:endfor
     #:elif reduction is not None or reductionOp is not None
         #:stop 'Cannot set the reduction list or reduction operation without setting the other'
     #:else
@@ -235,12 +255,9 @@
         #:if not isinstance(function_name, str)
             #:stop "When inlining for Cray Compiler, function name must be given and given as a string"
         #:endif
-#ifdef _CRAYFTN
         #:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n')
+        #:set cray_directive = '#ifdef _CRAYFTN\n' + cray_directive + '\n#else\n' + acc_directive + '\n#endif'
         $:cray_directive
-#else
-        $:acc_directive
-#endif
     #:else
         $:acc_directive
     #:endif
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index b521d1272f..b9b83f9fc1 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -20,7 +20,7 @@ module m_boundary_common
     implicit none
 
     type(scalar_field), dimension(:, :), allocatable :: bc_buffers
-    $:GPU_DECLARE(create=["bc_buffers"])
+    $:GPU_DECLARE(create='[bc_buffers]')
 
     real(wp) :: bcxb, bcxe, bcyb, bcye, bczb, bcze
 
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index 3ff223844f..b1cac0a70b 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -99,7 +99,7 @@ contains
         real(wp), dimension(num_species) :: Ys
         real(wp), dimension(num_species) :: omega
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["Ys", "omega"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega]')
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp
index c9bf9a4ad2..1857a31cd8 100644
--- a/src/common/m_finite_differences.fpp
+++ b/src/common/m_finite_differences.fpp
@@ -18,7 +18,7 @@ contains
 
         real(wp) :: divergence
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["divergence"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[divergence]')
         do x = ix_s%beg, ix_s%end
             do y = iy_s%beg, iy_s%end
                 do z = iz_s%beg, iz_s%end
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index f4b1815687..e66b2951a6 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -40,7 +40,7 @@ module m_phase_change
     real(wp) :: A, B, C, D
     !> @}
 
-    $:GPU_DECLARE(create=["max_iter","pCr","TCr","mixM","lp","vp","A","B","C","D"])
+    $:GPU_DECLARE(create='[max_iter,pCr,TCr,mixM,lp,vp,A,B,C,D]')
 
 contains
 
@@ -87,20 +87,20 @@ contains
         real(wp) :: rho, rM, m1, m2, MCT !< total density, total reacting mass, individual reacting masses
         real(wp) :: TvF !< total volume fraction
 
-        $:GPU_DECLARE(create=["pS","pSOV","pSSL","TS","TSOV","TSSL","TSatOV","TSatSL"])
-        $:GPU_DECLARE(create=["rhoe","dynE","rhos","rho","rM","m1","m2","MCT","TvF"])
+        $:GPU_DECLARE(create='[pS,pSOV,pSSL,TS,TSOV,TSSL,TSatOV,TSatSL]')
+        $:GPU_DECLARE(create='[rhoe,dynE,rhos,rho,rM,m1,m2,MCT,TvF]')
 
         real(wp), dimension(num_fluids) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
-        $:GPU_DECLARE(create=["p_infOV","p_infpT","p_infSL","sk","hk","gk","ek","rhok"])
+        $:GPU_DECLARE(create='[p_infOV,p_infpT,p_infSL,sk,hk,gk,ek,rhok]')
 
         !< Generic loop iterators
         integer :: i, j, k, l
 
         ! starting equilibrium solver
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["p_infOV", "p_infpT", "p_infSL", &
-            "sk", "hk", "gk", "ek", "rhok", "pS", "pSOV", "pSSL", &
-            "TS", "TSOV", "TSatOV", "TSatSL", "TSSL", "rhoe", &
-            "dynE", "rhos", "rho", "rM", "m1", "m2", "MCT", "TvF"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[p_infOV, p_infpT, p_infSL, &
+            sk, hk, gk, ek, rhok,pS, pSOV, pSSL, &
+            TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, &
+            dynE, rhos, rho, rM, m1, m2, MCT, TvF]')
         do j = 0, m
             do k = 0, n
                 do l = 0, p
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 08ce822a83..2aadab4aaf 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -50,16 +50,16 @@ module m_variables_conversion
     !! In simulation, gammas, pi_infs, and qvs are already declared in m_global_variables
 #ifndef MFC_SIMULATION
     real(wp), allocatable, public, dimension(:) :: gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps
-    $:GPU_DECLARE(create=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps"])
+    $:GPU_DECLARE(create='[gammas,gs_min,pi_infs,ps_inf,cvs,qvs,qvps]')
 #endif
 
     real(wp), allocatable, dimension(:) :: Gs
     integer, allocatable, dimension(:) :: bubrs
     real(wp), allocatable, dimension(:, :) :: Res
-    $:GPU_DECLARE(create=["bubrs","Gs","Res"])
+    $:GPU_DECLARE(create='[bubrs,Gs,Res]')
 
     integer :: is1b, is2b, is3b, is1e, is2e, is3e
-    $:GPU_DECLARE(create=["is1b","is2b","is3b","is1e","is2e","is3e"])
+    $:GPU_DECLARE(create='[is1b,is2b,is3b,is1e,is2e,is3e]')
 
     real(wp), allocatable, dimension(:, :, :), public :: rho_sf !< Scalar density function
     real(wp), allocatable, dimension(:, :, :), public :: gamma_sf !< Scalar sp. heat ratio function
@@ -610,7 +610,7 @@ contains
 
         integer :: i, j
 
-        $:GPU_ENTER_DATA(copyin=["is1b","is1e","is2b","is2e","is3b","is3e"])
+        $:GPU_ENTER_DATA(copyin='[is1b,is1e,is2b,is2e,is3b,is3e]')
 
 #ifdef MFC_SIMULATION
         @:ALLOCATE(gammas (1:num_fluids))
@@ -642,7 +642,7 @@ contains
             qvs(i) = fluid_pp(i)%qv
             qvps(i) = fluid_pp(i)%qvp
         end do
-        $:GPU_UPDATE(device=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps","Gs"])
+        $:GPU_UPDATE(device='[gammas,gs_min,pi_infs,ps_inf,cvs,qvs,qvps,Gs]')
 
 #ifdef MFC_SIMULATION
 
@@ -654,7 +654,7 @@ contains
                 end do
             end do
 
-            $:GPU_UPDATE(device=["Res","Re_idx","Re_size"])
+            $:GPU_UPDATE(device='[Res,Re_idx,Re_size]')
         end if
 #endif
 
@@ -668,7 +668,7 @@ contains
             do i = 1, nb
                 bubrs(i) = bub_idx%rs(i)
             end do
-            $:GPU_UPDATE(device=["bubrs"])
+            $:GPU_UPDATE(device='[bubrs]')
         end if
 
 #ifdef MFC_POST_PROCESS
@@ -869,9 +869,9 @@ contains
             end if
         #:endif
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_K", "alpha_rho_K", "Re_K", &
-            "nRtmp", "rho_K", "gamma_K", "pi_inf_K", "qv_K", &
-            "dyn_pres_K", "rhoYks", "B"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, Re_K, &
+            & nRtmp, rho_K, gamma_K, pi_inf_K,qv_K, &
+            & dyn_pres_K, rhoYks, B]')
         do l = ibounds(3)%beg, ibounds(3)%end
             do k = ibounds(2)%beg, ibounds(2)%end
                 do j = ibounds(1)%beg, ibounds(1)%end
@@ -1461,13 +1461,13 @@ contains
         is2b = is2%beg; is2e = is2%end
         is3b = is3%beg; is3e = is3%end
 
-        $:GPU_UPDATE(device=["is1b","is2b","is3b","is1e","is2e","is3e"])
+        $:GPU_UPDATE(device='[is1b,is2b,is3b,is1e,is2e,is3e]')
 
         ! Computing the flux variables from the primitive variables, without
         ! accounting for the contribution of either viscosity or capillarity
 #ifdef MFC_SIMULATION
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_rho_K", "vel_K", &
-            "alpha_K", "Re_K", "Y_K"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_K, vel_K, &
+            & alpha_K, Re_K, Y_K]')
         do l = is3b, is3e
             do k = is2b, is2e
                 do j = is1b, is1e
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index e319c0e963..c604d2bb35 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -23,43 +23,43 @@ module m_acoustic_src
     private; public :: s_initialize_acoustic_src, s_precalculate_acoustic_spatial_sources, s_acoustic_src_calculations
 
     integer, allocatable, dimension(:) :: pulse, support
-    $:GPU_DECLARE(create=["pulse","support"])
+    $:GPU_DECLARE(create='[pulse,support]')
 
     logical, allocatable, dimension(:) :: dipole
-    $:GPU_DECLARE(create=["dipole"])
+    $:GPU_DECLARE(create='[dipole]')
 
     real(wp), allocatable, target, dimension(:, :) :: loc_acoustic
-    $:GPU_DECLARE(create=["loc_acoustic"])
+    $:GPU_DECLARE(create='[loc_acoustic]')
 
     real(wp), allocatable, dimension(:) :: mag, length, height, wavelength, frequency
     real(wp), allocatable, dimension(:) :: gauss_sigma_dist, gauss_sigma_time, npulse, dir, delay
-    $:GPU_DECLARE(create=["mag","length","height","wavelength","frequency"])
-    $:GPU_DECLARE(create=["gauss_sigma_dist","gauss_sigma_time","npulse","dir","delay"])
+    $:GPU_DECLARE(create='[mag,length,height,wavelength,frequency]')
+    $:GPU_DECLARE(create='[gauss_sigma_dist,gauss_sigma_time,npulse,dir,delay]')
 
     real(wp), allocatable, dimension(:) :: foc_length, aperture
-    $:GPU_DECLARE(create=["foc_length","aperture"])
+    $:GPU_DECLARE(create='[foc_length,aperture]')
 
     real(wp), allocatable, dimension(:) :: element_spacing_angle, element_polygon_ratio, rotate_angle
-    $:GPU_DECLARE(create=["element_spacing_angle","element_polygon_ratio","rotate_angle"])
+    $:GPU_DECLARE(create='[element_spacing_angle,element_polygon_ratio,rotate_angle]')
 
     real(wp), allocatable, dimension(:) :: bb_bandwidth, bb_lowest_freq
-    $:GPU_DECLARE(create=["bb_bandwidth","bb_lowest_freq"])
+    $:GPU_DECLARE(create='[bb_bandwidth,bb_lowest_freq]')
 
     integer, allocatable, dimension(:) :: num_elements, element_on, bb_num_freq
-    $:GPU_DECLARE(create=["num_elements","element_on","bb_num_freq"])
+    $:GPU_DECLARE(create='[num_elements,element_on,bb_num_freq]')
 
     !> @name Acoustic source terms
     !> @{
     real(wp), allocatable, dimension(:, :, :) :: mass_src, e_src
     real(wp), allocatable, dimension(:, :, :, :) :: mom_src
     !> @}
-    $:GPU_DECLARE(create=["mass_src","e_src","mom_src"])
+    $:GPU_DECLARE(create='[mass_src,e_src,mom_src]')
 
     integer, dimension(:), allocatable :: source_spatials_num_points !< Number of non-zero source grid points for each source
-    $:GPU_DECLARE(create=["source_spatials_num_points"])
+    $:GPU_DECLARE(create='[source_spatials_num_points]')
 
     type(source_spatial_type), dimension(:), allocatable :: source_spatials !< Data of non-zero source grid points for each source
-    $:GPU_DECLARE(create=["source_spatials"])
+    $:GPU_DECLARE(create='[source_spatials]')
 
 contains
 
@@ -110,12 +110,12 @@ contains
                 delay(i) = acoustic(i)%delay
             end if
         end do
-        $:GPU_UPDATE(device=["loc_acoustic","mag","dipole","support","length", &
-            & "height","wavelength","frequency","gauss_sigma_dist", &
-            & "gauss_sigma_time","foc_length","aperture","npulse","pulse", &
-            & "dir","delay","element_polygon_ratio","rotate_angle", &
-            & "element_spacing_angle","num_elements","element_on", &
-            & "bb_num_freq","bb_bandwidth","bb_lowest_freq"])
+        $:GPU_UPDATE(device='[loc_acoustic,mag,dipole,support,length, &
+            & height,wavelength,frequency,gauss_sigma_dist, &
+            & gauss_sigma_time,foc_length,aperture,npulse,pulse, &
+            & dir,delay,element_polygon_ratio,rotate_angle, &
+            & element_spacing_angle,num_elements,element_on, &
+            & bb_num_freq,bb_bandwidth,bb_lowest_freq]')
 
         @:ALLOCATE(mass_src(0:m, 0:n, 0:p))
         @:ALLOCATE(mom_src(1:num_vels, 0:m, 0:n, 0:p))
@@ -219,7 +219,7 @@ contains
 
             deallocate (phi_rn)
 
-            $:GPU_PARALLEL_LOOP(private=["myalpha","myalpha_rho"])
+            $:GPU_PARALLEL_LOOP(private='[myalpha,myalpha_rho]')
             do i = 1, num_points
                 j = source_spatials(ai)%coord(1, i)
                 k = source_spatials(ai)%coord(2, i)
@@ -473,14 +473,14 @@ contains
                 call s_mpi_abort('Fatal Error: Inconsistent allocation of source_spatials')
             end if
 
-            $:GPU_UPDATE(device=["source_spatials(ai)%coord"])
-            $:GPU_UPDATE(device=["source_spatials(ai)%val"])
+            $:GPU_UPDATE(device='[source_spatials(ai)%coord]')
+            $:GPU_UPDATE(device='[source_spatials(ai)%val]')
             if (support(ai) >= 5) then
                 if (dim == 2) then
-                    $:GPU_UPDATE(device=["source_spatials(ai)%angle"])
+                    $:GPU_UPDATE(device='[source_spatials(ai)%angle]')
                 end if
                 if (dim == 3) then
-                    $:GPU_UPDATE(device=["source_spatials(ai)%xyz_to_r_ratios"])
+                    $:GPU_UPDATE(device='[source_spatials(ai)%xyz_to_r_ratios]')
                 end if
             end if
 
diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp
index 794e7be075..d5f811d273 100644
--- a/src/simulation/m_body_forces.fpp
+++ b/src/simulation/m_body_forces.fpp
@@ -22,7 +22,7 @@ module m_body_forces
               s_finalize_body_forces_module
 
     real(wp), allocatable, dimension(:, :, :) :: rhoM
-    $:GPU_DECLARE(create=["rhoM"])
+    $:GPU_DECLARE(create='[rhoM]')
 
 contains
 
@@ -67,7 +67,7 @@ contains
             end if
         end if
 
-        $:GPU_UPDATE(device=["accel_bf"])
+        $:GPU_UPDATE(device='[accel_bf]')
 
     end subroutine s_compute_acceleration
 
diff --git a/src/simulation/m_boundary_conditions.fpp b/src/simulation/m_boundary_conditions.fpp
index 5589ee23aa..e484c62a68 100644
--- a/src/simulation/m_boundary_conditions.fpp
+++ b/src/simulation/m_boundary_conditions.fpp
@@ -41,7 +41,7 @@ contains
         do dir = 1, num_dims
             do loc = -1, 1, 2
                 read (1) bc_type(dir, loc)%sf
-                $:GPU_UPDATE(device=["bc_type(dir, loc)%sf"])
+                $:GPU_UPDATE(device='[bc_type(dir, loc)%sf]')
             end do
         end do
         close (1)
@@ -57,7 +57,7 @@ contains
         do dir = 1, num_dims
             do loc = -1, 1, 2
                 read (1) bc_buffers(dir, loc)%sf
-                $:GPU_UPDATE(device=["bc_buffers(dir, loc)%sf"])
+                $:GPU_UPDATE(device='[bc_buffers(dir, loc)%sf]')
             end do
         end do
         close (1)
@@ -105,7 +105,7 @@ contains
                 call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), MPI_INTEGER, MPI_BC_TYPE_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr)
                 call MPI_File_read_all(file_id, bc_type(dir, loc)%sf, 1, MPI_BC_TYPE_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr)
                 offset = offset + sizeof(bc_type(dir, loc)%sf)
-                $:GPU_UPDATE(device=["bc_type(dir, loc)%sf"])
+                $:GPU_UPDATE(device='[bc_type(dir, loc)%sf]')
             end do
         end do
 
@@ -115,7 +115,7 @@ contains
                 call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), mpi_p, MPI_BC_BUFFER_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr)
                 call MPI_File_read_all(file_id, bc_buffers(dir, loc)%sf, 1, MPI_BC_BUFFER_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr)
                 offset = offset + sizeof(bc_buffers(dir, loc)%sf)
-                $:GPU_UPDATE(device=["bc_buffers(dir, loc)%sf"])
+                $:GPU_UPDATE(device='[bc_buffers(dir, loc)%sf]')
             end do
         end do
 
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index 02e1932c8e..0443e8f182 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -20,7 +20,7 @@ module m_bubbles
     real(wp) :: chi_vw  !< Bubble wall properties (Ando 2010)
     real(wp) :: k_mw    !< Bubble wall properties (Ando 2010)
     real(wp) :: rho_mw  !< Bubble wall properties (Ando 2010)
-    $:GPU_DECLARE(create=["chi_vw","k_mw","rho_mw"])
+    $:GPU_DECLARE(create='[chi_vw,k_mw,rho_mw]')
 
 contains
 
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index 201a17d6ab..c7d85c90f0 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -21,13 +21,13 @@ module m_bubbles_EE
 
     real(wp), allocatable, dimension(:, :, :) :: bub_adv_src
     real(wp), allocatable, dimension(:, :, :, :) :: bub_r_src, bub_v_src, bub_p_src, bub_m_src
-    $:GPU_DECLARE(create=["bub_adv_src","bub_r_src","bub_v_src","bub_p_src","bub_m_src"])
+    $:GPU_DECLARE(create='[bub_adv_src,bub_r_src,bub_v_src,bub_p_src,bub_m_src]')
 
     type(scalar_field) :: divu !< matrix for div(u)
-    $:GPU_DECLARE(create=["divu"])
+    $:GPU_DECLARE(create='[divu]')
 
     integer, allocatable, dimension(:) :: rs, vs, ms, ps
-    $:GPU_DECLARE(create=["rs","vs","ms","ps"])
+    $:GPU_DECLARE(create='[rs,vs,ms,ps]')
 
 contains
 
@@ -51,9 +51,9 @@ contains
             end if
         end do
 
-        $:GPU_UPDATE(device=["rs", "vs"])
+        $:GPU_UPDATE(device='[rs, vs]')
         if (.not. polytropic) then
-            $:GPU_UPDATE(device=["ps", "ms"])
+            $:GPU_UPDATE(device='[ps, ms]')
         end if
 
         @:ALLOCATE(divu%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
@@ -190,9 +190,9 @@ contains
         end do
 
         adap_dt_stop_max = 0
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["Rtmp", "Vtmp", "myalpha_rho", "myalpha"], &
-            reduction=["adap_dt_stop_max"], reductionOp="MAX", &
-            copy=["adap_dt_stop_max"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[Rtmp, Vtmp, myalpha_rho, myalpha]', &
+            & reduction='[[adap_dt_stop_max]]', reductionOp='[MAX]', &
+            & copy='[adap_dt_stop_max]')
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 71706d18f8..4489c2c850 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -32,26 +32,26 @@ module m_bubbles_EL
     real(wp), allocatable, dimension(:) :: bub_R0            !< Initial bubble radius
     real(wp), allocatable, dimension(:) :: Rmax_stats        !< Maximum radius
     real(wp), allocatable, dimension(:) :: Rmin_stats        !< Minimum radius
-    $:GPU_DECLARE(create=["lag_id", "bub_R0", "Rmax_stats", "Rmin_stats"])
+    $:GPU_DECLARE(create='[lag_id, bub_R0, Rmax_stats, Rmin_stats]')
 
     real(wp), allocatable, dimension(:) :: gas_mg            !< Bubble's gas mass
     real(wp), allocatable, dimension(:) :: gas_betaT         !< heatflux model (Preston et al., 2007)
     real(wp), allocatable, dimension(:) :: gas_betaC         !< massflux model (Preston et al., 2007)
     real(wp), allocatable, dimension(:) :: bub_dphidt        !< subgrid velocity potential (Maeda & Colonius, 2018)
-    $:GPU_DECLARE(create=["gas_mg", "gas_betaT", "gas_betaC", "bub_dphidt"])
+    $:GPU_DECLARE(create='[gas_mg, gas_betaT, gas_betaC, bub_dphidt]')
 
     !(nBub, 1 -> actual val or 2 -> temp val)
     real(wp), allocatable, dimension(:, :) :: gas_p          !< Pressure in the bubble
     real(wp), allocatable, dimension(:, :) :: gas_mv         !< Vapor mass in the bubble
     real(wp), allocatable, dimension(:, :) :: intfc_rad      !< Bubble radius
     real(wp), allocatable, dimension(:, :) :: intfc_vel      !< Velocity of the bubble interface
-    $:GPU_DECLARE(create=["gas_p", "gas_mv", "intfc_rad", "intfc_vel"])
+    $:GPU_DECLARE(create='[gas_p, gas_mv, intfc_rad, intfc_vel]')
     !(nBub, 1-> x or 2->y or 3 ->z, 1 -> actual or 2 -> temporal val)
     real(wp), allocatable, dimension(:, :, :) :: mtn_pos     !< Bubble's position
     real(wp), allocatable, dimension(:, :, :) :: mtn_posPrev !< Bubble's previous position
     real(wp), allocatable, dimension(:, :, :) :: mtn_vel     !< Bubble's velocity
     real(wp), allocatable, dimension(:, :, :) :: mtn_s       !< Bubble's computational cell position in real format
-    $:GPU_DECLARE(create=["mtn_pos", "mtn_posPrev", "mtn_vel", "mtn_s"])
+    $:GPU_DECLARE(create='[mtn_pos, mtn_posPrev, mtn_vel, mtn_s]')
     !(nBub, 1-> x or 2->y or 3 ->z, time-stage)
     real(wp), allocatable, dimension(:, :) :: intfc_draddt   !< Time derivative of bubble's radius
     real(wp), allocatable, dimension(:, :) :: intfc_dveldt   !< Time derivative of bubble's interface velocity
@@ -59,18 +59,18 @@ module m_bubbles_EL
     real(wp), allocatable, dimension(:, :) :: gas_dmvdt      !< Time derivative of the vapor mass in the bubble
     real(wp), allocatable, dimension(:, :, :) :: mtn_dposdt  !< Time derivative of the bubble's position
     real(wp), allocatable, dimension(:, :, :) :: mtn_dveldt  !< Time derivative of the bubble's velocity
-    $:GPU_DECLARE(create=["intfc_draddt", "intfc_dveldt", "gas_dpdt", "gas_dmvdt", "mtn_dposdt", "mtn_dveldt"])
+    $:GPU_DECLARE(create='[intfc_draddt, intfc_dveldt, gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt]')
 
     integer, private :: lag_num_ts                                  !<  Number of time stages in the time-stepping scheme
 
-    $:GPU_DECLARE(create=["lag_num_ts"])
+    $:GPU_DECLARE(create='[lag_num_ts]')
 
     integer :: nBubs                            !< Number of bubbles in the local domain
     real(wp) :: Rmax_glb, Rmin_glb       !< Maximum and minimum bubbe size in the local domain
     type(vector_field) :: q_beta                !< Projection of the lagrangian particles in the Eulerian framework
     integer :: q_beta_idx                       !< Size of the q_beta vector field
 
-    $:GPU_DECLARE(create=["nBubs","Rmax_glb","Rmin_glb","q_beta","q_beta_idx"])
+    $:GPU_DECLARE(create='[nBubs,Rmax_glb,Rmin_glb,q_beta,q_beta_idx]')
 
 contains
 
@@ -100,7 +100,7 @@ contains
             call s_mpi_abort('Please check the lag_params%solver_approach input')
         end if
 
-        $:GPU_UPDATE(device=["lag_num_ts", "q_beta_idx"])
+        $:GPU_UPDATE(device='[lag_num_ts, q_beta_idx]')
 
         @:ALLOCATE(q_beta%vf(1:q_beta_idx))
 
@@ -248,19 +248,19 @@ contains
 
         print *, " Lagrange bubbles running, in proc", proc_rank, "number:", bub_id, "/", id
 
-        $:GPU_UPDATE(device=["bubbles_lagrange", "lag_params"])
+        $:GPU_UPDATE(device='[bubbles_lagrange, lag_params]')
 
-        $:GPU_UPDATE(device=["lag_id","bub_R0","Rmax_stats","Rmin_stats","gas_mg", &
-            & "gas_betaT","gas_betaC","bub_dphidt","gas_p","gas_mv", &
-            & "intfc_rad","intfc_vel","mtn_pos","mtn_posPrev","mtn_vel", &
-            & "mtn_s","intfc_draddt","intfc_dveldt","gas_dpdt","gas_dmvdt", &
-            & "mtn_dposdt","mtn_dveldt","nBubs"])
+        $:GPU_UPDATE(device='[lag_id,bub_R0,Rmax_stats,Rmin_stats,gas_mg, &
+            & gas_betaT,gas_betaC,bub_dphidt,gas_p,gas_mv, &
+            & intfc_rad,intfc_vel,mtn_pos,mtn_posPrev,mtn_vel, &
+            & mtn_s,intfc_draddt,intfc_dveldt,gas_dpdt,gas_dmvdt, &
+            & mtn_dposdt,mtn_dveldt,nBubs]')
 
         Rmax_glb = min(dflt_real, -dflt_real)
         Rmin_glb = max(dflt_real, -dflt_real)
-        $:GPU_UPDATE(device=["Rmax_glb", "Rmin_glb"])
+        $:GPU_UPDATE(device='[Rmax_glb, Rmin_glb]')
 
-        $:GPU_UPDATE(device=["dx","dy","dz","x_cb","x_cc","y_cb","y_cc","z_cb","z_cc"])
+        $:GPU_UPDATE(device='[dx,dy,dz,x_cb,x_cc,y_cb,y_cc,z_cb,z_cc]')
 
         !Populate temporal variables
         call s_transfer_data_to_tmp()
@@ -527,7 +527,7 @@ contains
         ! Subgrid p_inf model based on Maeda and Colonius (2018).
         if (lag_params%pressure_corrector) then
             ! Calculate velocity potentials (valid for one bubble per cell)
-            $:GPU_PARALLEL_LOOP(private=["k","cell"])
+            $:GPU_PARALLEL_LOOP(private='[k,cell]')
             do k = 1, nBubs
                 call s_get_pinf(k, q_prim_vf, 2, paux, cell, preterm1, term2, Romega)
                 myR0 = bub_R0(k)
@@ -548,9 +548,9 @@ contains
 
         ! Radial motion model
         adap_dt_stop_max = 0
-        $:GPU_PARALLEL_LOOP(private=["k","myalpha_rho","myalpha","Re","cell"], &
-            & reduction=["adap_dt_stop_max"],reductionOp="MAX", &
-            & copy=["adap_dt_stop_max"],copyin=["stage"])
+        $:GPU_PARALLEL_LOOP(private='[k,myalpha_rho,myalpha,Re,cell]', &
+            & reduction='[[adap_dt_stop_max]]',reductionOp='[MAX]', &
+            & copy='[adap_dt_stop_max]',copyin='[stage]')
         do k = 1, nBubs
             ! Keller-Miksis model
 
@@ -618,7 +618,7 @@ contains
         if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.")
 
         ! Bubbles remain in a fixed position
-        $:GPU_PARALLEL_LOOP(collapse=2, private=["k"], copyin=["stage"])
+        $:GPU_PARALLEL_LOOP(collapse=2, private='[k]', copyin='[stage]')
         do k = 1, nBubs
             do l = 1, 3
                 mtn_dposdt(k, l, stage) = 0._wp
@@ -1027,7 +1027,7 @@ contains
         integer :: k
 
         if (time_stepper == 1) then ! 1st order TVD RK
-            $:GPU_PARALLEL_LOOP(private=["k"])
+            $:GPU_PARALLEL_LOOP(private='[k]')
             do k = 1, nBubs
                 !u{1} = u{n} +  dt * RHS{n}
                 intfc_rad(k, 1) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1)
@@ -1043,13 +1043,13 @@ contains
             if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
 
             if (lag_params%write_bubbles) then
-                $:GPU_UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
+                $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]')
                 call s_write_lag_particles(mytime)
             end if
 
         elseif (time_stepper == 2) then ! 2nd order TVD RK
             if (stage == 1) then
-                $:GPU_PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private='[k]')
                 do k = 1, nBubs
                     !u{1} = u{n} +  dt * RHS{n}
                     intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1)
@@ -1061,7 +1061,7 @@ contains
                 end do
 
             elseif (stage == 2) then
-                $:GPU_PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private='[k]')
                 do k = 1, nBubs
                     !u{1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1})
                     intfc_rad(k, 1) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/2._wp
@@ -1077,7 +1077,7 @@ contains
                 if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
 
                 if (lag_params%write_bubbles) then
-                    $:GPU_UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
+                    $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]')
                     call s_write_lag_particles(mytime)
                 end if
 
@@ -1085,7 +1085,7 @@ contains
 
         elseif (time_stepper == 3) then ! 3rd order TVD RK
             if (stage == 1) then
-                $:GPU_PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private='[k]')
                 do k = 1, nBubs
                     !u{1} = u{n} +  dt * RHS{n}
                     intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1)
@@ -1097,7 +1097,7 @@ contains
                 end do
 
             elseif (stage == 2) then
-                $:GPU_PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private='[k]')
                 do k = 1, nBubs
                     !u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}]
                     intfc_rad(k, 2) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/4._wp
@@ -1108,7 +1108,7 @@ contains
                     gas_mv(k, 2) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/4._wp
                 end do
             elseif (stage == 3) then
-                $:GPU_PARALLEL_LOOP(private=["k"])
+                $:GPU_PARALLEL_LOOP(private='[k]')
                 do k = 1, nBubs
                     !u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}]
                     intfc_rad(k, 1) = intfc_rad(k, 1) + (2._wp/3._wp)*dt*(intfc_draddt(k, 1)/4._wp + intfc_draddt(k, 2)/4._wp + intfc_draddt(k, 3))
@@ -1124,7 +1124,7 @@ contains
                 if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
 
                 if (lag_params%write_bubbles) then
-                    $:GPU_UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
+                    $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]')
                     call s_write_lag_particles(mytime)
                 end if
 
@@ -1194,7 +1194,7 @@ contains
 
         integer :: k
 
-        $:GPU_PARALLEL_LOOP(private=["k"])
+        $:GPU_PARALLEL_LOOP(private='[k]')
         do k = 1, nBubs
             gas_p(k, 2) = gas_p(k, 1)
             gas_mv(k, 2) = gas_mv(k, 1)
@@ -1416,9 +1416,9 @@ contains
         lag_void_max = 0._wp
         lag_void_avg = 0._wp
         lag_vol = 0._wp
-        $:GPU_PARALLEL_LOOP(collapse=3, reduction=[["lag_vol", "lag_void_avg"], &
-            ["lag_void_max"]], reductionOp=["+", "MAX"], &
-            copy=["lag_vol", "lag_void_avg", "lag_void_max"])
+        $:GPU_PARALLEL_LOOP(collapse=3, reduction='[[lag_vol, lag_void_avg], &
+            & [lag_void_max]]', reductionOp='[+, MAX]', &
+            & copy='[lag_vol, lag_void_avg, lag_void_max]')
         do k = 0, p
             do j = 0, n
                 do i = 0, m
@@ -1602,8 +1602,8 @@ contains
 
         integer :: k
 
-        $:GPU_PARALLEL_LOOP(reduction=[["Rmax_glb"], ["Rmin_glb"]], &
-            & reductionOp=["MAX", "MIN"], copy=["Rmax_glb","Rmin_glb"])
+        $:GPU_PARALLEL_LOOP(reduction='[[Rmax_glb], [Rmin_glb]]', &
+            & reductionOp='[MAX, MIN]', copy='[Rmax_glb,Rmin_glb]')
         do k = 1, nBubs
             Rmax_glb = max(Rmax_glb, intfc_rad(k, 1)/bub_R0(k))
             Rmin_glb = min(Rmin_glb, intfc_rad(k, 1)/bub_R0(k))
@@ -1622,7 +1622,7 @@ contains
         write (file_loc, '(A,I0,A)') 'stats_lag_bubbles_', proc_rank, '.dat'
         file_loc = trim(case_dir)//'/D/'//trim(file_loc)
 
-        $:GPU_UPDATE(host=["Rmax_glb","Rmin_glb"])
+        $:GPU_UPDATE(host='[Rmax_glb,Rmin_glb]')
 
         open (13, FILE=trim(file_loc), FORM='formatted', position='rewind')
         write (13, *) 'proc_rank, particleID, x, y, z, Rmax_glb, Rmin_glb'
@@ -1675,7 +1675,7 @@ contains
         end do
 
         nBubs = nBubs - 1
-        $:GPU_UPDATE(device=["nBubs"])
+        $:GPU_UPDATE(device='[nBubs]')
 
     end subroutine s_remove_lag_bubble
 
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index fe644244cc..daac6b8238 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -55,7 +55,7 @@ contains
         real(wp), dimension(3) :: s_coord
         integer :: l
 
-        $:GPU_PARALLEL_LOOP(private=["l","s_coord","cell"])
+        $:GPU_PARALLEL_LOOP(private='[l,s_coord,cell]')
         do l = 1, nBubs
 
             volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp
@@ -120,7 +120,7 @@ contains
         smearGridz = smearGrid
         if (p == 0) smearGridz = 1
 
-        $:GPU_PARALLEL_LOOP(private=["nodecoord","l","s_coord","cell","center"], copyin=["smearGrid","smearGridz"])
+        $:GPU_PARALLEL_LOOP(private='[nodecoord,l,s_coord,cell,center]', copyin='[smearGrid,smearGridz]')
         do l = 1, nBubs
             nodecoord(1:3) = 0
             center(1:3) = 0._wp
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 07b859e085..83ce6a864c 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -48,7 +48,7 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsz_vf
-    $:GPU_DECLARE(create=["q_prim_rsx_vf","q_prim_rsy_vf","q_prim_rsz_vf"])
+    $:GPU_DECLARE(create='[q_prim_rsx_vf,q_prim_rsy_vf,q_prim_rsz_vf]')
 
     type(scalar_field), allocatable, dimension(:) :: F_rs_vf, F_src_rs_vf !<
 
@@ -58,7 +58,7 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: F_rsx_vf, F_src_rsx_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: F_rsy_vf, F_src_rsy_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: F_rsz_vf, F_src_rsz_vf !<
-    $:GPU_DECLARE(create=["F_rsx_vf","F_src_rsx_vf","F_rsy_vf","F_src_rsy_vf","F_rsz_vf","F_src_rsz_vf"])
+    $:GPU_DECLARE(create='[F_rsx_vf,F_src_rsx_vf,F_rsy_vf,F_src_rsy_vf,F_rsz_vf,F_src_rsz_vf]')
 
     !! There is a CCE bug that is causing some subset of these variables to interfere
     !! with variables of the same name in m_riemann_solvers.fpp, and giving this versions
@@ -69,14 +69,14 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf_l, flux_src_rsx_vf_l !<
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf_l, flux_src_rsy_vf_l
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf_l, flux_src_rsz_vf_l
-    $:GPU_DECLARE(create=["flux_rsx_vf_l","flux_src_rsx_vf_l","flux_rsy_vf_l","flux_src_rsy_vf_l","flux_rsz_vf_l","flux_src_rsz_vf_l"])
+    $:GPU_DECLARE(create='[flux_rsx_vf_l,flux_src_rsx_vf_l,flux_rsy_vf_l,flux_src_rsy_vf_l,flux_rsz_vf_l,flux_src_rsz_vf_l]')
 
     real(wp) :: c           !< Cell averaged speed of sound
     real(wp), dimension(2) :: Re          !< Cell averaged Reynolds numbers
-    $:GPU_DECLARE(create=["c","Re"])
+    $:GPU_DECLARE(create='[c,Re]')
 
     real(wp) :: dpres_ds !< Spatial derivatives in s-dir of pressure
-    $:GPU_DECLARE(create=["dpres_ds"])
+    $:GPU_DECLARE(create='[dpres_ds]')
 
     real(wp), allocatable, dimension(:) :: ds !< Cell-width distribution in the s-direction
 
@@ -96,21 +96,21 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :) :: pi_coef_y !< Polynomial interpolant coefficients in y-dir
     real(wp), allocatable, dimension(:, :, :) :: pi_coef_z !< Polynomial interpolant coefficients in z-dir
 
-    $:GPU_DECLARE(create=["ds","fd_coef_x","fd_coef_y","fd_coef_z","pi_coef_x","pi_coef_y","pi_coef_z"])
+    $:GPU_DECLARE(create='[ds,fd_coef_x,fd_coef_y,fd_coef_z,pi_coef_x,pi_coef_y,pi_coef_z]')
 
     !! The first dimension of the array identifies the polynomial, the
     !! second dimension identifies the position of its coefficients and the last
     !! dimension denotes the location of the CBC.
 
     type(int_bounds_info) :: is1, is2, is3 !< Indical bounds in the s1-, s2- and s3-directions
-    $:GPU_DECLARE(create=["is1","is2","is3"])
+    $:GPU_DECLARE(create='[is1,is2,is3]')
 
     integer :: dj
     integer :: bcxb, bcxe, bcyb, bcye, bczb, bcze
     integer :: cbc_dir, cbc_loc
     integer :: flux_cbc_index
-    $:GPU_DECLARE(create=["dj","bcxb","bcxe","bcyb","bcye","bczb","bcze"])
-    $:GPU_DECLARE(create=["cbc_dir", "cbc_loc","flux_cbc_index"])
+    $:GPU_DECLARE(create='[dj,bcxb,bcxe,bcyb,bcye,bczb,bcze]')
+    $:GPU_DECLARE(create='[cbc_dir, cbc_loc,flux_cbc_index]')
 
     !! GRCBC inputs for subsonic inflow and outflow conditions consisting of
     !! inflow velocities, pressure, density and void fraction as well as
@@ -119,9 +119,9 @@ module m_cbc
     real(wp), allocatable, dimension(:) :: pres_in, pres_out, Del_in, Del_out
     real(wp), allocatable, dimension(:, :) :: vel_in, vel_out
     real(wp), allocatable, dimension(:, :) :: alpha_rho_in, alpha_in
-    $:GPU_DECLARE(create=["pres_in","pres_out","Del_in","Del_out"])
-    $:GPU_DECLARE(create=["vel_in","vel_out"])
-    $:GPU_DECLARE(create=["alpha_rho_in","alpha_in"])
+    $:GPU_DECLARE(create='[pres_in,pres_out,Del_in,Del_out]')
+    $:GPU_DECLARE(create='[vel_in,vel_out]')
+    $:GPU_DECLARE(create='[alpha_rho_in,alpha_in]')
 
 contains
 
@@ -138,7 +138,7 @@ contains
         else
             flux_cbc_index = adv_idx%end
         end if
-        $:GPU_UPDATE(device=["flux_cbc_index"])
+        $:GPU_UPDATE(device='[flux_cbc_index]')
 
         call s_any_cbc_boundaries(is_cbc)
 
@@ -388,8 +388,8 @@ contains
 
         end if
 
-        $:GPU_UPDATE(device=["fd_coef_x","fd_coef_y","fd_coef_z", &
-            & "pi_coef_x","pi_coef_y","pi_coef_z"])
+        $:GPU_UPDATE(device='[fd_coef_x,fd_coef_y,fd_coef_z, &
+            & pi_coef_x,pi_coef_y,pi_coef_z]')
 
         ! Associating the procedural pointer to the appropriate subroutine
         ! that will be utilized in the conversion to the mixture variables
@@ -397,20 +397,20 @@ contains
         bcxb = bc_x%beg
         bcxe = bc_x%end
 
-        $:GPU_UPDATE(device=["bcxb", "bcxe"])
+        $:GPU_UPDATE(device='[bcxb, bcxe]')
 
         if (n > 0) then
             bcyb = bc_y%beg
             bcye = bc_y%end
 
-            $:GPU_UPDATE(device=["bcyb", "bcye"])
+            $:GPU_UPDATE(device='[bcyb, bcye]')
         end if
 
         if (p > 0) then
             bczb = bc_z%beg
             bcze = bc_z%end
 
-            $:GPU_UPDATE(device=["bczb", "bcze"])
+            $:GPU_UPDATE(device='[bczb, bcze]')
         end if
 
         ! Allocate GRCBC inputs
@@ -442,8 +442,8 @@ contains
                 end do
             end if
         #:endfor
-        $:GPU_UPDATE(device=["vel_in","vel_out","pres_in","pres_out", &
-            & "Del_in","Del_out","alpha_rho_in","alpha_in"])
+        $:GPU_UPDATE(device='[vel_in,vel_out,pres_in,pres_out, &
+            & Del_in,Del_out,alpha_rho_in,alpha_in]')
 
     end subroutine s_initialize_cbc_module
 
@@ -607,7 +607,7 @@ contains
 
         end if
 
-        $:GPU_UPDATE(device=["ds"])
+        $:GPU_UPDATE(device='[ds]')
 
     end subroutine s_associate_cbc_coefficients_pointers
 
@@ -683,7 +683,7 @@ contains
         cbc_dir = cbc_dir_norm
         cbc_loc = cbc_loc_norm
 
-        $:GPU_UPDATE(device=["cbc_dir", "cbc_loc"])
+        $:GPU_UPDATE(device='[cbc_dir, cbc_loc]')
 
         call s_initialize_cbc(q_prim_vf, flux_vf, flux_src_vf, &
                               ix, iy, iz)
@@ -775,10 +775,10 @@ contains
                 end if
 
                 ! FD2 or FD4 of RHS at j = 0
-                $:GPU_PARALLEL_LOOP(collapse=2, private=["alpha_rho", "vel", "adv", &
-                    "mf", "dvel_ds", "dadv_ds", "Re_cbc", "dalpha_rho_ds","dvel_dt", &
-                    "dadv_dt", "dalpha_rho_dt", "L", "lambda", "Ys", "dYs_dt", &
-                    "dYs_ds", "h_k", "Cp_i", "Gamma_i", "Xs"])
+                $:GPU_PARALLEL_LOOP(collapse=2, private='[alpha_rho, vel, adv, &
+                    & mf, dvel_ds, dadv_ds, Re_cbc, dalpha_rho_ds,dvel_dt, &
+                    & dadv_dt, dalpha_rho_dt, L, lambda, Ys, dYs_dt, &
+                    & dYs_ds, h_k, Cp_i, Gamma_i, Xs]')
                 do r = is3%beg, is3%end
                     do k = is2%beg, is2%end
 
@@ -1163,8 +1163,8 @@ contains
         end if
 
         dj = max(0, cbc_loc)
-        $:GPU_UPDATE(device=["is1","is2","is3","dj"])
-        $:GPU_UPDATE(device=["dir_idx","dir_flg"])
+        $:GPU_UPDATE(device='[is1,is2,is3,dj]')
+        $:GPU_UPDATE(device='[dir_idx,dir_flg]')
 
         ! Reshaping Inputted Data in x-direction
         if (cbc_dir == 1) then
@@ -1414,7 +1414,7 @@ contains
 
         ! Determining the indicial shift based on CBC location
         dj = max(0, cbc_loc)
-        $:GPU_UPDATE(device=["dj"])
+        $:GPU_UPDATE(device='[dj]')
 
         ! Reshaping Outputted Data in x-direction
         if (cbc_dir == 1) then
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 2cec4cbe38..5a4c8e0aa1 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -54,14 +54,14 @@ module m_data_output
     real(wp), allocatable, dimension(:, :, :) :: ccfl_sf  !< CCFL stability criterion
     real(wp), allocatable, dimension(:, :, :) :: Rc_sf  !< Rc stability criterion
     real(wp), public, allocatable, dimension(:, :) :: c_mass
-    $:GPU_DECLARE(create=["icfl_sf","vcfl_sf","ccfl_sf","Rc_sf"])
+    $:GPU_DECLARE(create='[icfl_sf,vcfl_sf,ccfl_sf,Rc_sf]')
 
     real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids
     real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids
     real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids
     real(wp) :: Rc_min_loc, Rc_min_glb !< Rc   stability extrema on local and global grids
-    $:GPU_DECLARE(create=["icfl_max_loc","icfl_max_glb","vcfl_max_loc","vcfl_max_glb"])
-    $:GPU_DECLARE(create=["ccfl_max_loc","ccfl_max_glb","Rc_min_loc","Rc_min_glb"])
+    $:GPU_DECLARE(create='[icfl_max_loc,icfl_max_glb,vcfl_max_loc,vcfl_max_glb]')
+    $:GPU_DECLARE(create='[ccfl_max_loc,ccfl_max_glb,Rc_min_loc,Rc_min_glb]')
 
     !> @name ICFL, VCFL, CCFL and Rc stability criteria extrema over all the time-steps
     !> @{
@@ -278,7 +278,7 @@ contains
         integer :: j, k, l
 
         ! Computing Stability Criteria at Current Time-step
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["vel", "alpha", "Re"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re]')
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -302,10 +302,10 @@ contains
         ! Determining local stability criteria extrema at current time-step
 
 #ifdef _CRAYFTN
-        $:GPU_UPDATE(host=["icfl_sf"])
+        $:GPU_UPDATE(host='[icfl_sf]')
 
         if (viscous) then
-            $:GPU_UPDATE(host=["vcfl_sf","Rc_sf"])
+            $:GPU_UPDATE(host='[vcfl_sf,Rc_sf]')
         end if
 
         icfl_max_loc = maxval(icfl_sf)
@@ -526,7 +526,7 @@ contains
         if (prim_vars_wrt .or. (n == 0 .and. p == 0)) then
             call s_convert_conservative_to_primitive_variables(q_cons_vf, q_T_sf, q_prim_vf, idwint)
             do i = 1, sys_size
-                $:GPU_UPDATE(host=["q_prim_vf(i)%sf(:,:,:)"])
+                $:GPU_UPDATE(host='[q_prim_vf(i)%sf(:,:,:)]')
             end do
             ! q_prim_vf(bubxb) stores the value of nb needed in riemann solvers, so replace with true primitive value (=1._wp)
             if (qbmm) then
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 4646f574a1..b30bdbcf93 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -46,12 +46,12 @@ module m_fftw
     !! Filtered complex data in Fourier space
 
 #if defined(MFC_OpenACC)
-    $:GPU_DECLARE(create=["real_size","cmplx_size","x_size","batch_size","Nfq"])
+    $:GPU_DECLARE(create='[real_size,cmplx_size,x_size,batch_size,Nfq]')
 
     real(dp), allocatable, target :: data_real_gpu(:)
     complex(dp), allocatable, target :: data_cmplx_gpu(:)
     complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:)
-    $:GPU_DECLARE(create=["data_real_gpu","data_cmplx_gpu","data_fltr_cmplx_gpu"])
+    $:GPU_DECLARE(create='[data_real_gpu,data_cmplx_gpu,data_fltr_cmplx_gpu]')
 
 #if defined(__PGI)
     integer :: fwd_plan_gpu, bwd_plan_gpu
@@ -89,8 +89,8 @@ contains
         gpu_fft_size(1) = real_size; 
         iembed(1) = 0
         oembed(1) = 0
-        $:GPU_ENTER_DATA(copyin=["real_size","cmplx_size","x_size","sys_size","batch_size","Nfq"])
-        $:GPU_UPDATE(device=["real_size","cmplx_size","x_size","sys_size","batch_size"])
+        $:GPU_ENTER_DATA(copyin='[real_size,cmplx_size,x_size,sys_size,batch_size,Nfq]')
+        $:GPU_UPDATE(device='[real_size,cmplx_size,x_size,sys_size,batch_size]')
 #else
         ! Allocate input and output DFT data sizes
         fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
@@ -171,7 +171,7 @@ contains
 #endif
         !$acc end host_data
         Nfq = 3
-        $:GPU_UPDATE(device=["Nfq"])
+        $:GPU_UPDATE(device='[Nfq]')
 
         $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
@@ -212,7 +212,7 @@ contains
                 end do
             end do
 
-            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate=["i"])
+            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
@@ -231,7 +231,7 @@ contains
             !$acc end host_data
 
             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
-            $:GPU_UPDATE(device=["Nfq"])
+            $:GPU_UPDATE(device='[Nfq]')
 
             $:GPU_PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
@@ -251,7 +251,7 @@ contains
 #endif
             !$acc end host_data
 
-            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate=["i"])
+            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index b5a174e0e0..f869a31ac0 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -53,7 +53,7 @@ module m_global_parameters
     logical :: cyl_coord
     integer :: grid_geometry
     !> @}
-    $:GPU_DECLARE(create=["cyl_coord","grid_geometry"])
+    $:GPU_DECLARE(create='[cyl_coord,grid_geometry]')
 
     !> @name Cell-boundary (CB) locations in the x-, y- and z-directions, respectively
     !> @{
@@ -76,7 +76,7 @@ module m_global_parameters
 
     real(wp) :: dt !< Size of the time-step
 
-    $:GPU_DECLARE(create=["x_cb","y_cb","z_cb","x_cc","y_cc","z_cc","dx","dy","dz","dt","m","n","p"])
+    $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p]')
 
     !> @name Starting time-step iteration, stopping time-step iteration and the number
     !! of time-step iterations between successive solution backups, respectively
@@ -90,7 +90,7 @@ module m_global_parameters
     real(wp) :: t_stop, t_save, cfl_target
     integer :: n_start
     !> @}
-    $:GPU_DECLARE(create=["cfl_target"])
+    $:GPU_DECLARE(create='[cfl_target]')
 
     logical :: cfl_adap_dt, cfl_const_dt, cfl_dt
 
@@ -158,7 +158,7 @@ module m_global_parameters
     logical :: bulk_stress   !< Bulk stresses
     logical :: cont_damage   !< Continuum damage modeling
 
-    $:GPU_DECLARE(create=["chemistry"])
+    $:GPU_DECLARE(create='[chemistry]')
 
     logical :: bodyForces
     logical :: bf_x, bf_y, bf_z !< body force toggle in three directions
@@ -169,27 +169,27 @@ module m_global_parameters
         #:endfor
     #:endfor
     real(wp), dimension(3) :: accel_bf
-    $:GPU_DECLARE(create=["accel_bf"])
+    $:GPU_DECLARE(create='[accel_bf]')
 
     integer :: cpu_start, cpu_end, cpu_rate
 
     #:if not MFC_CASE_OPTIMIZATION
-        $:GPU_DECLARE(create=["num_dims","num_vels","weno_polyn","weno_order"])
-        $:GPU_DECLARE(create=["weno_num_stencils","num_fluids","wenojs"])
-        $:GPU_DECLARE(create=["mapped_weno", "wenoz","teno","wenoz_q","mhd","relativity"])
+        $:GPU_DECLARE(create='[num_dims,num_vels,weno_polyn,weno_order]')
+        $:GPU_DECLARE(create='[weno_num_stencils,num_fluids,wenojs]')
+        $:GPU_DECLARE(create='[mapped_weno, wenoz,teno,wenoz_q,mhd,relativity]')
     #:endif
 
-    $:GPU_DECLARE(create=["mpp_lim","model_eqns","mixture_err","alt_soundspeed"])
-    $:GPU_DECLARE(create=["avg_state","mp_weno","weno_eps","teno_CT","hypoelasticity"])
-    $:GPU_DECLARE(create=["hyperelasticity","hyper_model","elasticity","low_Mach"])
-    $:GPU_DECLARE(create=["viscous","shear_stress","bulk_stress","cont_damage"])
+    $:GPU_DECLARE(create='[mpp_lim,model_eqns,mixture_err,alt_soundspeed]')
+    $:GPU_DECLARE(create='[avg_state,mp_weno,weno_eps,teno_CT,hypoelasticity]')
+    $:GPU_DECLARE(create='[hyperelasticity,hyper_model,elasticity,low_Mach]')
+    $:GPU_DECLARE(create='[viscous,shear_stress,bulk_stress,cont_damage]')
 
     logical :: relax          !< activate phase change
     integer :: relax_model    !< Relaxation model
     real(wp) :: palpha_eps     !< trigger parameter for the p relaxation procedure, phase change model
     real(wp) :: ptgalpha_eps   !< trigger parameter for the pTg relaxation procedure, phase change model
 
-    $:GPU_DECLARE(create=["relax", "relax_model", "palpha_eps","ptgalpha_eps"])
+    $:GPU_DECLARE(create='[relax, relax_model, palpha_eps,ptgalpha_eps]')
 
     integer :: num_bc_patches
     logical :: bc_io
@@ -199,9 +199,9 @@ module m_global_parameters
     !> @{
     type(int_bounds_info) :: bc_x, bc_y, bc_z
     !> @}
-    $:GPU_DECLARE(create=["bc_x%vb1", "bc_x%vb2", "bc_x%vb3", "bc_x%ve1", "bc_x%ve2", "bc_x%ve3"])
-    $:GPU_DECLARE(create=["bc_y%vb1", "bc_y%vb2", "bc_y%vb3", "bc_y%ve1", "bc_y%ve2", "bc_y%ve3"])
-    $:GPU_DECLARE(create=["bc_z%vb1", "bc_z%vb2", "bc_z%vb3", "bc_z%ve1", "bc_z%ve2", "bc_z%ve3"])
+    $:GPU_DECLARE(create='[bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3]')
+    $:GPU_DECLARE(create='[bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3]')
+    $:GPU_DECLARE(create='[bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3]')
 
     type(bounds_info) :: x_domain, y_domain, z_domain
     real(wp) :: x_a, y_a, z_a
@@ -255,20 +255,20 @@ module m_global_parameters
     integer :: c_idx                                   !< Index of color function
     integer :: damage_idx                              !< Index of damage state variable (D) for continuum damage model
     !> @}
-    $:GPU_DECLARE(create=["sys_size","E_idx","n_idx","bub_idx","alf_idx","gamma_idx"])
-    $:GPU_DECLARE(create=["pi_inf_idx","B_idx","stress_idx","xi_idx","b_size"])
-    $:GPU_DECLARE(create=["tensor_size","species_idx","c_idx"])
+    $:GPU_DECLARE(create='[sys_size,E_idx,n_idx,bub_idx,alf_idx,gamma_idx]')
+    $:GPU_DECLARE(create='[pi_inf_idx,B_idx,stress_idx,xi_idx,b_size]')
+    $:GPU_DECLARE(create='[tensor_size,species_idx,c_idx]')
 
     ! Cell Indices for the (local) interior points (O-m, O-n, 0-p).
     ! Stands for "InDices With INTerior".
     type(int_bounds_info) :: idwint(1:3)
-    $:GPU_DECLARE(create=["idwint"])
+    $:GPU_DECLARE(create='[idwint]')
 
     ! Cell Indices for the entire (local) domain. In simulation and post_process,
     ! this includes the buffer region. idwbuff and idwint are the same otherwise.
     ! Stands for "InDices With BUFFer".
     type(int_bounds_info) :: idwbuff(1:3)
-    $:GPU_DECLARE(create=["idwbuff"])
+    $:GPU_DECLARE(create='[idwbuff]')
 
     !> @name The number of fluids, along with their identifying indexes, respectively,
     !! for which viscous effects, e.g. the shear and/or the volume Reynolds (Re)
@@ -278,7 +278,7 @@ module m_global_parameters
     integer, allocatable, dimension(:, :) :: Re_idx
     !> @}
 
-    $:GPU_DECLARE(create=["Re_size","Re_idx"])
+    $:GPU_DECLARE(create='[Re_size,Re_idx]')
 
     ! The WENO average (WA) flag regulates whether the calculation of any cell-
     ! average spatial derivatives is carried out in each cell by utilizing the
@@ -289,7 +289,7 @@ module m_global_parameters
     real(wp) :: wa_flg
     !> @{
 
-    $:GPU_DECLARE(create=["wa_flg"])
+    $:GPU_DECLARE(create='[wa_flg]')
 
     !> @name The coordinate direction indexes and flags (flg), respectively, for which
     !! the configurations will be determined with respect to a working direction
@@ -301,14 +301,14 @@ module m_global_parameters
     integer, dimension(3) :: dir_idx_tau !!used for hypoelasticity=true
     !> @}
 
-    $:GPU_DECLARE(create=["dir_idx","dir_flg","dir_idx_tau"])
+    $:GPU_DECLARE(create='[dir_idx,dir_flg,dir_idx_tau]')
 
     integer :: buff_size !<
     !! The number of cells that are necessary to be able to store enough boundary
     !! conditions data to march the solution in the physical computational domain
     !! to the next time-step.
 
-    $:GPU_DECLARE(create=["buff_size"])
+    $:GPU_DECLARE(create='[buff_size]')
 
     integer :: shear_num !! Number of shear stress components
     integer, dimension(3) :: shear_indices !<
@@ -319,7 +319,7 @@ module m_global_parameters
     !! Indices of shear stress components to reflect for boundary conditions.
     !! Size: (1:3, 1:shear_BC_flip_num) for (x/y/z, [indices])
 
-    $:GPU_DECLARE(create=["shear_num","shear_indices","shear_BC_flip_num","shear_BC_flip_indices"])
+    $:GPU_DECLARE(create='[shear_num,shear_indices,shear_BC_flip_num,shear_BC_flip_indices]')
 
     ! END: Simulation Algorithm Parameters
 
@@ -339,7 +339,7 @@ module m_global_parameters
     !! The finite-difference number is given by MAX(1, fd_order/2). Essentially,
     !! it is a measure of the half-size of the finite-difference stencil for the
     !! selected order of accuracy.
-    $:GPU_DECLARE(create=["fd_order","fd_number"])
+    $:GPU_DECLARE(create='[fd_order,fd_number]')
 
     logical :: probe_wrt
     logical :: integral_wrt
@@ -352,7 +352,7 @@ module m_global_parameters
     !> @{
     real(wp) :: rhoref, pref
     !> @}
-    $:GPU_DECLARE(create=["rhoref","pref"])
+    $:GPU_DECLARE(create='[rhoref,pref]')
 
     !> @name Immersed Boundaries
     !> @{
@@ -367,7 +367,7 @@ module m_global_parameters
     !! the maximum allowable number of patches, num_patches_max, may be changed
     !! in the module m_derived_types.f90.
 
-    $:GPU_DECLARE(create=["ib","num_ibs","patch_ib"])
+    $:GPU_DECLARE(create='[ib,num_ibs,patch_ib]')
     !> @}
 
     !> @name Bubble modeling
@@ -382,31 +382,31 @@ module m_global_parameters
     real(wp) :: Ca       !< Cavitation number
     real(wp) :: Web      !< Weber number
     real(wp) :: Re_inv   !< Inverse Reynolds number
-    $:GPU_DECLARE(create=["R0ref","Ca","Web","Re_inv"])
+    $:GPU_DECLARE(create='[R0ref,Ca,Web,Re_inv]')
 
     real(wp), dimension(:), allocatable :: weight !< Simpson quadrature weights
     real(wp), dimension(:), allocatable :: R0     !< Bubble sizes
     real(wp), dimension(:), allocatable :: V0     !< Bubble velocities
-    $:GPU_DECLARE(create=["weight","R0","V0"])
+    $:GPU_DECLARE(create='[weight,R0,V0]')
 
     logical :: bubbles_euler      !< Bubbles euler on/off
     logical :: polytropic   !< Polytropic  switch
     logical :: polydisperse !< Polydisperse bubbles
-    $:GPU_DECLARE(create=["bubbles_euler","polytropic","polydisperse"])
+    $:GPU_DECLARE(create='[bubbles_euler,polytropic,polydisperse]')
 
     logical :: adv_n        !< Solve the number density equation and compute alpha from number density
     logical :: adap_dt      !< Adaptive step size control
     real(wp) :: adap_dt_tol !< Tolerance to control adaptive step size
-    $:GPU_DECLARE(create=["adv_n","adap_dt","adap_dt_tol"])
+    $:GPU_DECLARE(create='[adv_n,adap_dt,adap_dt_tol]')
 
     integer :: bubble_model !< Gilmore or Keller--Miksis bubble model
     integer :: thermal      !< Thermal behavior. 1 = adiabatic, 2 = isotherm, 3 = transfer
-    $:GPU_DECLARE(create=["bubble_model","thermal"])
+    $:GPU_DECLARE(create='[bubble_model,thermal]')
 
     real(wp), allocatable, dimension(:, :, :) :: ptil  !< Pressure modification
 
     real(wp) :: poly_sigma  !< log normal sigma for polydisperse PDF
-    $:GPU_DECLARE(create=["ptil", "poly_sigma"])
+    $:GPU_DECLARE(create='[ptil, poly_sigma]')
 
     logical :: qbmm      !< Quadrature moment method
     integer, parameter :: nmom = 6 !< Number of carried moments per R0 location
@@ -415,39 +415,39 @@ module m_global_parameters
     integer :: R0_type
 
     real(wp) :: pi_fac   !< Factor for artificial pi_inf
-    $:GPU_DECLARE(create=["qbmm", "nmomsp","nmomtot","R0_type","pi_fac"])
+    $:GPU_DECLARE(create='[qbmm, nmomsp,nmomtot,R0_type,pi_fac]')
 
     #:if not MFC_CASE_OPTIMIZATION
-        $:GPU_DECLARE(create=["nb"])
+        $:GPU_DECLARE(create='[nb]')
     #:endif
 
     type(scalar_field), allocatable, dimension(:) :: mom_sp
     type(scalar_field), allocatable, dimension(:, :, :) :: mom_3d
-    $:GPU_DECLARE(create=["mom_sp","mom_3d"])
+    $:GPU_DECLARE(create='[mom_sp,mom_3d]')
 
     !> @}
 
     type(chemistry_parameters) :: chem_params
-    $:GPU_DECLARE(create=["chem_params"])
+    $:GPU_DECLARE(create='[chem_params]')
 
     !> @name Physical bubble parameters (see Ando 2010, Preston 2007)
     !> @{
 
     real(wp) :: R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_vl, k_nl, cp_n, cp_v
-    $:GPU_DECLARE(create=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw",])
-    $:GPU_DECLARE(create=["pv","M_n", "M_v","k_vl","k_nl","cp_n","cp_v"])
+    $:GPU_DECLARE(create='[R_n,R_v,phi_vn,phi_nv,Pe_c,Tw]')
+    $:GPU_DECLARE(create='[pv,M_n, M_v,k_vl,k_nl,cp_n,cp_v]')
 
     real(wp), dimension(:), allocatable :: k_n, k_v, pb0, mass_n0, mass_v0, Pe_T
     real(wp), dimension(:), allocatable :: Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN
-    $:GPU_DECLARE(create=["k_n","k_v","pb0","mass_n0","mass_v0","Pe_T"])
-    $:GPU_DECLARE(create=["Re_trans_T","Re_trans_c","Im_trans_T","Im_trans_c","omegaN"])
+    $:GPU_DECLARE(create='[k_n,k_v,pb0,mass_n0,mass_v0,Pe_T]')
+    $:GPU_DECLARE(create='[Re_trans_T,Re_trans_c,Im_trans_T,Im_trans_c,omegaN]')
 
     real(wp) :: mul0, ss, gamma_v, mu_v
     real(wp) :: gamma_m, gamma_n, mu_n
     real(wp) :: gam
     !> @}
 
-    $:GPU_DECLARE(create=["mul0","ss","gamma_v","mu_v","gamma_m","gamma_n","mu_n","gam"])
+    $:GPU_DECLARE(create='[mul0,ss,gamma_v,mu_v,gamma_m,gamma_n,mu_n,gam]')
 
     !> @name Acoustic acoustic_source parameters
     !> @{
@@ -455,14 +455,14 @@ module m_global_parameters
     type(acoustic_parameters), dimension(num_probes_max) :: acoustic !< Acoustic source parameters
     integer :: num_source !< Number of acoustic sources
     !> @}
-    $:GPU_DECLARE(create=["acoustic_source","acoustic","num_source"])
+    $:GPU_DECLARE(create='[acoustic_source,acoustic,num_source]')
 
     !> @name Surface tension parameters
     !> @{
 
     real(wp) :: sigma
     logical :: surface_tension
-    $:GPU_DECLARE(create=["sigma","surface_tension"])
+    $:GPU_DECLARE(create='[sigma,surface_tension]')
     !> @}
 
     integer :: momxb, momxe
@@ -473,13 +473,13 @@ module m_global_parameters
     integer :: strxb, strxe
     integer :: chemxb, chemxe
     integer :: xibeg, xiend
-    $:GPU_DECLARE(create=["momxb","momxe","advxb","advxe","contxb","contxe"])
-    $:GPU_DECLARE(create=["intxb","intxe", "bubxb","bubxe"])
-    $:GPU_DECLARE(create=["strxb","strxe","chemxb","chemxe"])
-    $:GPU_DECLARE(create=["xibeg","xiend"])
+    $:GPU_DECLARE(create='[momxb,momxe,advxb,advxe,contxb,contxe]')
+    $:GPU_DECLARE(create='[intxb,intxe, bubxb,bubxe]')
+    $:GPU_DECLARE(create='[strxb,strxe,chemxb,chemxe]')
+    $:GPU_DECLARE(create='[xibeg,xiend]')
 
     real(wp), allocatable, dimension(:) :: gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps
-    $:GPU_DECLARE(create=["gammas","gs_min","pi_infs","ps_inf","cvs","qvs","qvps"])
+    $:GPU_DECLARE(create='[gammas,gs_min,pi_infs,ps_inf,cvs,qvs,qvps]')
 
     real(wp) :: mytime       !< Current simulation time
     real(wp) :: finaltime    !< Final simulation time
@@ -490,25 +490,25 @@ module m_global_parameters
 
     type(pres_field), allocatable, dimension(:) :: mv_ts
 
-    $:GPU_DECLARE(create=["pb_ts","mv_ts"])
+    $:GPU_DECLARE(create='[pb_ts,mv_ts]')
 
     !> @name lagrangian subgrid bubble parameters
     !> @{!
     logical :: bubbles_lagrange                         !< Lagrangian subgrid bubble model switch
     type(bubbles_lagrange_parameters) :: lag_params     !< Lagrange bubbles' parameters
-    $:GPU_DECLARE(create=["bubbles_lagrange","lag_params"])
+    $:GPU_DECLARE(create='[bubbles_lagrange,lag_params]')
     !> @}
 
     real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D)
     logical :: powell !< Powell‐correction for div B = 0
-    $:GPU_DECLARE(create=["Bx0","powell"])
+    $:GPU_DECLARE(create='[Bx0,powell]')
 
     !> @name Continuum damage model parameters
     !> @{!
     real(wp) :: tau_star        !< Stress threshold for continuum damage modeling
     real(wp) :: cont_damage_s   !< Exponent s for continuum damage modeling
     real(wp) :: alpha_bar       !< Damage rate factor for continuum damage modeling
-    $:GPU_DECLARE(create=["tau_star","cont_damage_s","alpha_bar"])
+    $:GPU_DECLARE(create='[tau_star,cont_damage_s,alpha_bar]')
     !> @}
 
 contains
@@ -801,10 +801,10 @@ contains
             else
                 weno_num_stencils = weno_polyn
             end if
-            $:GPU_UPDATE(device=["weno_polyn"])
-            $:GPU_UPDATE(device=["weno_num_stencils"])
-            $:GPU_UPDATE(device=["nb"])
-            $:GPU_UPDATE(device=["num_dims","num_vels","num_fluids"])
+            $:GPU_UPDATE(device='[weno_polyn]')
+            $:GPU_UPDATE(device='[weno_num_stencils]')
+            $:GPU_UPDATE(device='[nb]')
+            $:GPU_UPDATE(device='[num_dims,num_vels,num_fluids]')
         #:endif
 
         ! Initializing the number of fluids for which viscous effects will
@@ -1040,7 +1040,7 @@ contains
             if (Re_size(1) > 0._wp) shear_stress = .true.
             if (Re_size(2) > 0._wp) bulk_stress = .true.
 
-            $:GPU_UPDATE(device=["Re_size","viscous","shear_stress","bulk_stress"])
+            $:GPU_UPDATE(device='[Re_size,viscous,shear_stress,bulk_stress]')
 
             ! Bookkeeping the indexes of any viscous fluids and any pairs of
             ! fluids whose interface will support effects of surface tension
@@ -1096,7 +1096,7 @@ contains
                     ! y-dir: flip tau_xy and tau_yz
                     ! z-dir: flip tau_xz and tau_yz
                 end if
-                $:GPU_UPDATE(device=["shear_num","shear_indices","shear_BC_flip_num","shear_BC_flip_indices"])
+                $:GPU_UPDATE(device='[shear_num,shear_indices,shear_BC_flip_num,shear_BC_flip_indices]')
             end if
 
             if (hyperelasticity) then
@@ -1163,7 +1163,7 @@ contains
         ! cell-boundary values or otherwise, the unaltered left and right,
         ! WENO-reconstructed, cell-boundary values
         wa_flg = 0._wp; if (weno_avg) wa_flg = 1._wp
-        $:GPU_UPDATE(device=["wa_flg"])
+        $:GPU_UPDATE(device='[wa_flg]')
 
         ! Resort to default WENO-JS if no other WENO scheme is selected
         #:if not MFC_CASE_OPTIMIZATION
@@ -1173,7 +1173,7 @@ contains
         if (ib) allocate (MPI_IO_IB_DATA%var%sf(0:m, 0:n, 0:p))
         Np = 0
 
-        $:GPU_UPDATE(device=["Re_size"])
+        $:GPU_UPDATE(device='[Re_size]')
 
         if (elasticity) then
             fd_number = max(1, fd_order/2)
@@ -1191,7 +1191,7 @@ contains
                                            idwint, idwbuff, viscous, &
                                            bubbles_lagrange, m, n, p, &
                                            num_dims)
-        $:GPU_UPDATE(device=["idwint", "idwbuff"])
+        $:GPU_UPDATE(device='[idwint, idwbuff]')
 
         ! Configuring Coordinate Direction Indexes
         if (bubbles_euler) then
@@ -1201,7 +1201,7 @@ contains
                 & idwbuff(3)%beg:idwbuff(3)%end))
         end if
 
-        $:GPU_UPDATE(device=["fd_order", "fd_number"])
+        $:GPU_UPDATE(device='[fd_order, fd_number]')
 
         if (cyl_coord .neqv. .true.) then ! Cartesian grid
             grid_geometry = 1
@@ -1228,44 +1228,44 @@ contains
         chemxb = species_idx%beg
         chemxe = species_idx%end
 
-        $:GPU_UPDATE(device=["momxb","momxe","advxb","advxe","contxb","contxe", &
-            & "bubxb","bubxe","intxb","intxe","sys_size","buff_size","E_idx", &
-            & "alf_idx","n_idx","adv_n","adap_dt","pi_fac","strxb","strxe", &
-            & "chemxb","chemxe","c_idx"])
-        $:GPU_UPDATE(device=["b_size","xibeg","xiend","tensor_size"])
+        $:GPU_UPDATE(device='[momxb,momxe,advxb,advxe,contxb,contxe, &
+            & bubxb,bubxe,intxb,intxe,sys_size,buff_size,E_idx, &
+            & alf_idx,n_idx,adv_n,adap_dt,pi_fac,strxb,strxe, &
+            & chemxb,chemxe,c_idx]')
+        $:GPU_UPDATE(device='[b_size,xibeg,xiend,tensor_size]')
 
-        $:GPU_UPDATE(device=["species_idx"])
-        $:GPU_UPDATE(device=["cfl_target","m","n","p"])
+        $:GPU_UPDATE(device='[species_idx]')
+        $:GPU_UPDATE(device='[cfl_target,m,n,p]')
 
-        $:GPU_UPDATE(device=["alt_soundspeed","acoustic_source","num_source"])
-        $:GPU_UPDATE(device=["dt","sys_size","buff_size","pref","rhoref", &
-            & "gamma_idx","pi_inf_idx","E_idx","alf_idx","stress_idx", &
-            & "mpp_lim","bubbles_euler","hypoelasticity","alt_soundspeed", &
-            & "avg_state","num_fluids","model_eqns","num_dims","num_vels", &
-            & "mixture_err","grid_geometry","cyl_coord","mp_weno","weno_eps", &
-            & "teno_CT","hyperelasticity","hyper_model","elasticity","xi_idx", &
-            & "B_idx","low_Mach"])
+        $:GPU_UPDATE(device='[alt_soundspeed,acoustic_source,num_source]')
+        $:GPU_UPDATE(device='[dt,sys_size,buff_size,pref,rhoref, &
+            & gamma_idx,pi_inf_idx,E_idx,alf_idx,stress_idx, &
+            & mpp_lim,bubbles_euler,hypoelasticity,alt_soundspeed, &
+            & avg_state,num_fluids,model_eqns,num_dims,num_vels, &
+            & mixture_err,grid_geometry,cyl_coord,mp_weno,weno_eps, &
+            & teno_CT,hyperelasticity,hyper_model,elasticity,xi_idx, &
+            & B_idx,low_Mach]')
 
-        $:GPU_UPDATE(device=["Bx0", "powell"])
+        $:GPU_UPDATE(device='[Bx0, powell]')
 
-        $:GPU_UPDATE(device=["cont_damage","tau_star","cont_damage_s","alpha_bar"])
+        $:GPU_UPDATE(device='[cont_damage,tau_star,cont_damage_s,alpha_bar]')
 
         #:if not MFC_CASE_OPTIMIZATION
-            $:GPU_UPDATE(device=["wenojs","mapped_weno","wenoz","teno"])
-            $:GPU_UPDATE(device=["wenoz_q"])
-            $:GPU_UPDATE(device=["mhd", "relativity"])
+            $:GPU_UPDATE(device='[wenojs,mapped_weno,wenoz,teno]')
+            $:GPU_UPDATE(device='[wenoz_q]')
+            $:GPU_UPDATE(device='[mhd, relativity]')
         #:endif
 
-        $:GPU_ENTER_DATA(copyin=["nb","R0ref","Ca","Web","Re_inv","weight","R0", &
-            & "V0","bubbles_euler","polytropic","polydisperse","qbmm","R0_type", &
-            & "ptil","bubble_model","thermal","poly_sigma"])
-        $:GPU_ENTER_DATA(copyin=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw","pv", &
-            & "M_n","M_v","k_n","k_v","pb0","mass_n0","mass_v0","Pe_T", &
-            & "Re_trans_T","Re_trans_c","Im_trans_T","Im_trans_c","omegaN", &
-            & "mul0","ss","gamma_v","mu_v","gamma_m","gamma_n","mu_n","gam"])
-        $:GPU_ENTER_DATA(copyin=["dir_idx","dir_flg","dir_idx_tau"])
+        $:GPU_ENTER_DATA(copyin='[nb,R0ref,Ca,Web,Re_inv,weight,R0, &
+            & V0,bubbles_euler,polytropic,polydisperse,qbmm,R0_type, &
+            & ptil,bubble_model,thermal,poly_sigma]')
+        $:GPU_ENTER_DATA(copyin='[R_n,R_v,phi_vn,phi_nv,Pe_c,Tw,pv, &
+            & M_n,M_v,k_n,k_v,pb0,mass_n0,mass_v0,Pe_T, &
+            & Re_trans_T,Re_trans_c,Im_trans_T,Im_trans_c,omegaN, &
+            & mul0,ss,gamma_v,mu_v,gamma_m,gamma_n,mu_n,gam]')
+        $:GPU_ENTER_DATA(copyin='[dir_idx,dir_flg,dir_idx_tau]')
 
-        $:GPU_ENTER_DATA(copyin=["relax","relax_model","palpha_eps","ptgalpha_eps"])
+        $:GPU_ENTER_DATA(copyin='[relax,relax_model,palpha_eps,ptgalpha_eps]')
 
         ! Allocating grid variables for the x-, y- and z-directions
         @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index ad888c813d..6c11405c86 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -26,14 +26,14 @@ module m_hyperelastic
     !! The btensor at the cell-interior Gaussian quadrature points.
     !! These tensor is needed to be calculated once and make the code DRY.
     type(vector_field) :: btensor !<
-    $:GPU_DECLARE(create=["btensor"])
+    $:GPU_DECLARE(create='[btensor]')
 
     real(wp), allocatable, dimension(:, :) :: fd_coeff_x
     real(wp), allocatable, dimension(:, :) :: fd_coeff_y
     real(wp), allocatable, dimension(:, :) :: fd_coeff_z
-    $:GPU_DECLARE(create=["fd_coeff_x","fd_coeff_y", "fd_coeff_z"])
+    $:GPU_DECLARE(create='[fd_coeff_x,fd_coeff_y, fd_coeff_z]')
     real(wp), allocatable, dimension(:) :: Gs
-    $:GPU_DECLARE(create=["Gs"])
+    $:GPU_DECLARE(create='[Gs]')
 
 contains
 
@@ -59,7 +59,7 @@ contains
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
-        $:GPU_UPDATE(device=["Gs"])
+        $:GPU_UPDATE(device='[Gs]')
 
         @:ALLOCATE(fd_coeff_x(-fd_number:fd_number, 0:m))
         if (n > 0) then
@@ -72,16 +72,16 @@ contains
         ! Computing centered finite difference coefficients
         call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x, buff_size, &
                                                       fd_number, fd_order)
-        $:GPU_UPDATE(device=["fd_coeff_x"])
+        $:GPU_UPDATE(device='[fd_coeff_x]')
         if (n > 0) then
             call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y, buff_size, &
                                                           fd_number, fd_order)
-            $:GPU_UPDATE(device=["fd_coeff_y"])
+            $:GPU_UPDATE(device='[fd_coeff_y]')
         end if
         if (p > 0) then
             call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z, buff_size, &
                                                           fd_number, fd_order)
-            $:GPU_UPDATE(device=["fd_coeff_z"])
+            $:GPU_UPDATE(device='[fd_coeff_z]')
         end if
 
     end subroutine s_initialize_hyperelastic_module
@@ -106,8 +106,8 @@ contains
         real(wp) :: G
         integer :: j, k, l, i, r
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_K", "alpha_rho_K", "rho", &
-            "gamma", "pi_inf", "qv", "G", "Re", "tensora", "tensorb"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, rho, &
+            & gamma, pi_inf, qv, G, Re, tensora, tensorb]')
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index b2b5d7a859..27c34a090c 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -20,20 +20,20 @@ module m_hypoelastic
  s_compute_damage_state
 
     real(wp), allocatable, dimension(:) :: Gs
-    $:GPU_DECLARE(create=["Gs"])
+    $:GPU_DECLARE(create='[Gs]')
 
     real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz
     real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz
     real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz
-    $:GPU_DECLARE(create=["du_dx","du_dy","du_dz","dv_dx","dv_dy","dv_dz","dw_dx","dw_dy","dw_dz"])
+    $:GPU_DECLARE(create='[du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz]')
 
     real(wp), allocatable, dimension(:, :, :) :: rho_K_field, G_K_field
-    $:GPU_DECLARE(create=["rho_K_field","G_K_field"])
+    $:GPU_DECLARE(create='[rho_K_field,G_K_field]')
 
     real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_z_h
-    $:GPU_DECLARE(create=["fd_coeff_x_h","fd_coeff_y_h","fd_coeff_z_h"])
+    $:GPU_DECLARE(create='[fd_coeff_x_h,fd_coeff_y_h,fd_coeff_z_h]')
 
 contains
 
@@ -55,7 +55,7 @@ contains
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
-        $:GPU_UPDATE(device=["Gs"])
+        $:GPU_UPDATE(device='[Gs]')
 
         @:ALLOCATE(fd_coeff_x_h(-fd_number:fd_number, 0:m))
         if (n > 0) then
@@ -68,16 +68,16 @@ contains
         ! Computing centered finite difference coefficients
         call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_h, buff_size, &
                                                       fd_number, fd_order)
-        $:GPU_UPDATE(device=["fd_coeff_x_h"])
+        $:GPU_UPDATE(device='[fd_coeff_x_h]')
         if (n > 0) then
             call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_h, buff_size, &
                                                           fd_number, fd_order)
-            $:GPU_UPDATE(device=["fd_coeff_y_h"])
+            $:GPU_UPDATE(device='[fd_coeff_y_h]')
         end if
         if (p > 0) then
             call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_h, buff_size, &
                                                           fd_number, fd_order)
-            $:GPU_UPDATE(device=["fd_coeff_z_h"])
+            $:GPU_UPDATE(device='[fd_coeff_z_h]')
         end if
 
     end subroutine s_initialize_hypoelastic_module
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 7adad082ca..1290b3911a 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -35,15 +35,15 @@ module m_ibm
     type(integer_field), public :: ib_markers
     type(levelset_field), public :: levelset
     type(levelset_norm_field), public :: levelset_norm
-    $:GPU_DECLARE(create=["ib_markers","levelset","levelset_norm"])
+    $:GPU_DECLARE(create='[ib_markers,levelset,levelset_norm]')
 
     type(ghost_point), dimension(:), allocatable :: ghost_points
     type(ghost_point), dimension(:), allocatable :: inner_points
-    $:GPU_DECLARE(create=["ghost_points","inner_points"])
+    $:GPU_DECLARE(create='[ghost_points,inner_points]')
 
     integer :: num_gps !< Number of ghost points
     integer :: num_inner_gps !< Number of ghost points
-    $:GPU_DECLARE(create=["gp_layers","num_gps","num_inner_gps"])
+    $:GPU_DECLARE(create='[gp_layers,num_gps,num_inner_gps]')
 
 contains
 
@@ -70,7 +70,7 @@ contains
         @:ACC_SETUP_SFs(levelset)
         @:ACC_SETUP_SFs(levelset_norm)
 
-        $:GPU_ENTER_DATA(copyin=["num_gps","num_inner_gps"])
+        $:GPU_ENTER_DATA(copyin='[num_gps,num_inner_gps]')
 
     end subroutine s_initialize_ibm_module
 
@@ -80,31 +80,31 @@ contains
 
         integer :: i, j, k
 
-        $:GPU_UPDATE(device=["ib_markers%sf"])
-        $:GPU_UPDATE(device=["levelset%sf"])
-        $:GPU_UPDATE(device=["levelset_norm%sf"])
+        $:GPU_UPDATE(device='[ib_markers%sf]')
+        $:GPU_UPDATE(device='[levelset%sf]')
+        $:GPU_UPDATE(device='[levelset_norm%sf]')
 
         ! Get neighboring IB variables from other processors
         call s_mpi_sendrecv_ib_buffers(ib_markers, gp_layers)
 
-        $:GPU_UPDATE(host=["ib_markers%sf"])
+        $:GPU_UPDATE(host='[ib_markers%sf]')
 
         call s_find_num_ghost_points(num_gps, num_inner_gps)
 
-        $:GPU_UPDATE(device=["num_gps", "num_inner_gps"])
+        $:GPU_UPDATE(device='[num_gps, num_inner_gps]')
         @:ALLOCATE(ghost_points(1:num_gps))
         @:ALLOCATE(inner_points(1:num_inner_gps))
 
-        $:GPU_ENTER_DATA(copyin=["ghost_points","inner_points"])
+        $:GPU_ENTER_DATA(copyin='[ghost_points,inner_points]')
 
         call s_find_ghost_points(ghost_points, inner_points)
-        $:GPU_UPDATE(device=["ghost_points", "inner_points"])
+        $:GPU_UPDATE(device='[ghost_points, inner_points]')
 
         call s_compute_image_points(ghost_points, levelset, levelset_norm)
-        $:GPU_UPDATE(device=["ghost_points"])
+        $:GPU_UPDATE(device='[ghost_points]')
 
         call s_compute_interpolation_coeffs(ghost_points)
-        $:GPU_UPDATE(device=["ghost_points"])
+        $:GPU_UPDATE(device='[ghost_points]')
 
     end subroutine s_ibm_setup
 
@@ -152,11 +152,11 @@ contains
         type(ghost_point) :: gp
         type(ghost_point) :: innerp
 
-        $:GPU_PARALLEL_LOOP(private=["physical_loc","dyn_pres","alpha_rho_IP", &
-            & "alpha_IP","pres_IP","vel_IP","vel_g","vel_norm_IP","r_IP", &
-            & "v_IP","pb_IP","mv_IP","nmom_IP","presb_IP","massv_IP","rho", &
-            & "gamma","pi_inf","Re_K","G_K","Gs","gp","innerp","norm","buf", &
-            & "j","k","l","q","coeff"])
+        $:GPU_PARALLEL_LOOP(private='[physical_loc,dyn_pres,alpha_rho_IP, &
+            & alpha_IP,pres_IP,vel_IP,vel_g,vel_norm_IP,r_IP, &
+            & v_IP,pb_IP,mv_IP,nmom_IP,presb_IP,massv_IP,rho, &
+            & gamma,pi_inf,Re_K,G_K,Gs,gp,innerp,norm,buf, &
+            & j,k,l,q,coeff]')
         do i = 1, num_gps
 
             gp = ghost_points(i)
@@ -300,9 +300,9 @@ contains
         end do
 
         !Correct the state of the inner points in IBs
-        $:GPU_PARALLEL_LOOP(private=["physical_loc","dyn_pres","alpha_rho_IP", &
-            & "alpha_IP","vel_g","rho","gamma","pi_inf","Re_K","innerp", &
-            & "j","k","l","q"])
+        $:GPU_PARALLEL_LOOP(private='[physical_loc,dyn_pres,alpha_rho_IP, &
+            & alpha_IP,vel_g,rho,gamma,pi_inf,Re_K,innerp, &
+            & j,k,l,q]')
         do i = 1, num_inner_gps
 
             vel_g = 0._wp
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index 4f76567423..b17ec69271 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -24,12 +24,12 @@ module m_mhd
     real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz
     real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz
     real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz
-    $:GPU_DECLARE(create=["du_dx","du_dy","du_dz","dv_dx","dv_dy","dv_dz","dw_dx","dw_dy","dw_dz"])
+    $:GPU_DECLARE(create='[du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz]')
 
     real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h
     real(wp), allocatable, dimension(:, :) :: fd_coeff_z_h
-    $:GPU_DECLARE(create=["fd_coeff_x_h","fd_coeff_y_h","fd_coeff_z_h"])
+    $:GPU_DECLARE(create='[fd_coeff_x_h,fd_coeff_y_h,fd_coeff_z_h]')
 
 contains
 
@@ -52,12 +52,12 @@ contains
 
         ! Computing centered finite difference coefficients
         call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_h, buff_size, fd_number, fd_order)
-        $:GPU_UPDATE(device=["fd_coeff_x_h"])
+        $:GPU_UPDATE(device='[fd_coeff_x_h]')
         call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_h, buff_size, fd_number, fd_order)
-        $:GPU_UPDATE(device=["fd_coeff_y_h"])
+        $:GPU_UPDATE(device='[fd_coeff_y_h]')
         if (p > 0) then
             call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_h, buff_size, fd_number, fd_order)
-            $:GPU_UPDATE(device=["fd_coeff_z_h"])
+            $:GPU_UPDATE(device='[fd_coeff_z_h]')
         end if
 
     end subroutine s_initialize_mhd_powell_module
@@ -76,7 +76,7 @@ contains
         real(wp), dimension(3) :: v, B
         real(wp) :: divB, vdotB
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["v", "B"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[v, B]')
         do q = 0, p
             do l = 0, n
                 do k = 0, m
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 4ec4c463f5..bc8e75538c 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -46,7 +46,7 @@ module m_mpi_proxy
     !> @{
     integer, private :: err_code, ierr, v_size
     !> @}
-    $:GPU_DECLARE(create=["v_size"])
+    $:GPU_DECLARE(create='[v_size]')
 
 contains
 
@@ -798,7 +798,7 @@ contains
             if (bc_x%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_x%end
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -834,7 +834,7 @@ contains
                 else
 #endif
 
-                    $:GPU_UPDATE(host=["ib_buff_send","ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send,ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -853,7 +853,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_x%beg
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = 0, p
                     do k = 0, n
                         do j = 0, gp_layers - 1
@@ -888,7 +888,7 @@ contains
                     !$acc wait
                 else
 #endif
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -908,12 +908,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:GPU_UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device='[ib_buff_recv]')
             end if
 #endif
 
             ! Unpacking buffer received from bc_x%beg
-            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
             do l = 0, p
                 do k = 0, n
                     do j = -gp_layers, -1
@@ -929,7 +929,7 @@ contains
 
             if (bc_x%beg >= 0) then      ! PBC at the end and beginning
 
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 ! Packing buffer to be sent to bc_x%beg
                 do l = 0, p
                     do k = 0, n
@@ -966,7 +966,7 @@ contains
                 else
 #endif
 
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
                     call MPI_SENDRECV( &
                         ib_buff_send(0), &
                         gp_layers*(n + 1)*(p + 1), &
@@ -983,7 +983,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_x%end
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = 0, p
                     do k = 0, n
                         do j = m - gp_layers + 1, m
@@ -1019,7 +1019,7 @@ contains
                 else
 #endif
 
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     call MPI_SENDRECV( &
                         ib_buff_send(0), &
@@ -1037,11 +1037,11 @@ contains
             end if
 
             if (rdma_mpi .eqv. .false.) then
-                $:GPU_UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device='[ib_buff_recv]')
             end if
 
             ! Unpacking buffer received from bc_x%end
-            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
             do l = 0, p
                 do k = 0, n
                     do j = m + 1, m + gp_layers
@@ -1061,7 +1061,7 @@ contains
             if (bc_y%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_y%end
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1098,7 +1098,7 @@ contains
                 else
 #endif
 
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1117,7 +1117,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_y%beg
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1154,7 +1154,7 @@ contains
                 else
 #endif
 
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1174,12 +1174,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:GPU_UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device='[ib_buff_recv]')
             end if
 #endif
 
             ! Unpacking buffer received from bc_y%beg
-            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
             do l = 0, p
                 do k = -gp_layers, -1
                     do j = -gp_layers, m + gp_layers
@@ -1197,7 +1197,7 @@ contains
             if (bc_y%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_y%beg
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = 0, p
                     do k = 0, gp_layers - 1
                         do j = -gp_layers, m + gp_layers
@@ -1234,7 +1234,7 @@ contains
                 else
 #endif
 
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1253,7 +1253,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_y%end
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = 0, p
                     do k = n - gp_layers + 1, n
                         do j = -gp_layers, m + gp_layers
@@ -1290,7 +1290,7 @@ contains
                 else
 #endif
 
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1310,12 +1310,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:GPU_UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device='[ib_buff_recv]')
             end if
 #endif
 
             ! Unpacking buffer received form bc_y%end
-            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
             do l = 0, p
                 do k = n + 1, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1335,7 +1335,7 @@ contains
             if (bc_z%end >= 0) then      ! PBC at the beginning and end
 
                 ! Packing buffer to be sent to bc_z%end
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1373,7 +1373,7 @@ contains
                 else
 #endif
 
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1392,7 +1392,7 @@ contains
             else                        ! PBC at the beginning only
 
                 ! Packing buffer to be sent to bc_z%beg
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1429,7 +1429,7 @@ contains
                 else
 #endif
 
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1449,12 +1449,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:GPU_UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device='[ib_buff_recv]')
             end if
 #endif
 
             ! Unpacking buffer from bc_z%beg
-            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
             do l = -gp_layers, -1
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers
@@ -1473,7 +1473,7 @@ contains
             if (bc_z%beg >= 0) then      ! PBC at the end and beginning
 
                 ! Packing buffer to be sent to bc_z%beg
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = 0, gp_layers - 1
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1509,7 +1509,7 @@ contains
                     !$acc wait
                 else
 #endif
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1528,7 +1528,7 @@ contains
             else                        ! PBC at the end only
 
                 ! Packing buffer to be sent to bc_z%end
-                $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+                $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                 do l = p - gp_layers + 1, p
                     do k = -gp_layers, n + gp_layers
                         do j = -gp_layers, m + gp_layers
@@ -1565,7 +1565,7 @@ contains
                     !$acc wait
                 else
 #endif
-                    $:GPU_UPDATE(host=["ib_buff_send"])
+                    $:GPU_UPDATE(host='[ib_buff_send]')
 
                     ! Send/receive buffer to/from bc_x%end/bc_x%beg
                     call MPI_SENDRECV( &
@@ -1585,12 +1585,12 @@ contains
 
 #if defined(MFC_OpenACC)
             if (rdma_mpi .eqv. .false.) then
-                $:GPU_UPDATE(device=["ib_buff_recv"])
+                $:GPU_UPDATE(device='[ib_buff_recv]')
             end if
 #endif
 
             ! Unpacking buffer received from bc_z%end
-            $:GPU_PARALLEL_LOOP(collapse=3,private=["r"])
+            $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
             do l = p + 1, p + gp_layers
                 do k = -gp_layers, n + gp_layers
                     do j = -gp_layers, m + gp_layers
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 40e56ebf9d..93361bf0a5 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -25,21 +25,21 @@ module m_qbmm
     private; public :: s_initialize_qbmm_module, s_mom_inv, s_coeff, s_compute_qbmm_rhs
 
     real(wp), allocatable, dimension(:, :, :, :, :) :: momrhs
-    $:GPU_DECLARE(create=["momrhs"])
+    $:GPU_DECLARE(create='[momrhs]')
 
     #:if MFC_CASE_OPTIMIZATION
         integer, parameter :: nterms = ${nterms}$
     #:else
         integer :: nterms
-        $:GPU_DECLARE(create=["nterms"])
+        $:GPU_DECLARE(create='[nterms]')
     #:endif
 
     type(int_bounds_info) :: is1_qbmm, is2_qbmm, is3_qbmm
-    $:GPU_DECLARE(create=["is1_qbmm","is2_qbmm","is3_qbmm"])
+    $:GPU_DECLARE(create='[is1_qbmm,is2_qbmm,is3_qbmm]')
 
     integer, allocatable, dimension(:) :: bubrs
     integer, allocatable, dimension(:, :) :: bubmoms
-    $:GPU_DECLARE(create=["bubrs","bubmoms"])
+    $:GPU_DECLARE(create='[bubrs,bubmoms]')
 
 contains
 
@@ -57,8 +57,8 @@ contains
                 nterms = 7
             end if
 
-            $:GPU_ENTER_DATA(copyin=["nterms"])
-            $:GPU_UPDATE(device=["nterms"])
+            $:GPU_ENTER_DATA(copyin='[nterms]')
+            $:GPU_UPDATE(device='[nterms]')
 
         #:endif
 
@@ -392,7 +392,7 @@ contains
             end do
         end if
 
-        $:GPU_UPDATE(device=["momrhs"])
+        $:GPU_UPDATE(device='[momrhs]')
 
         @:ALLOCATE(bubrs(1:nb))
         @:ALLOCATE(bubmoms(1:nb, 1:nmom))
@@ -400,14 +400,14 @@ contains
         do i = 1, nb
             bubrs(i) = bub_idx%rs(i)
         end do
-        $:GPU_UPDATE(device=["bubrs"])
+        $:GPU_UPDATE(device='[bubrs]')
 
         do j = 1, nmom
             do i = 1, nb
                 bubmoms(i, j) = bub_idx%moms(i, j)
             end do
         end do
-        $:GPU_UPDATE(device=["bubmoms"])
+        $:GPU_UPDATE(device='[bubmoms]')
 
     end subroutine s_initialize_qbmm_module
 
@@ -433,7 +433,7 @@ contains
         end select
 
         if (.not. polytropic) then
-            $:GPU_PARALLEL_LOOP(collapse=5,private=["nb_q","nR","nR2","R","R2","nb_dot","nR_dot","nR2_dot","var","AX"])
+            $:GPU_PARALLEL_LOOP(collapse=5,private='[nb_q,nR,nR2,R,R2,nb_dot,nR_dot,nR2_dot,var,AX]')
             do i = 1, nb
                 do q = 1, nnode
                     do l = 0, p
@@ -715,12 +715,12 @@ contains
         integer :: id1, id2, id3, i1, i2, j, q, r
 
         is1_qbmm = ix; is2_qbmm = iy; is3_qbmm = iz
-        $:GPU_UPDATE(device=["is1_qbmm","is2_qbmm","is3_qbmm"])
+        $:GPU_UPDATE(device='[is1_qbmm,is2_qbmm,is3_qbmm]')
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["moms", "msum", "wght", "abscX", &
-            "abscY", "wght_pb", "wght_mv", "wght_ht", "coeff", "ht", "r", "q", &
-            "n_tait", "B_tait", "pres", "rho", "nbub", "c", "alf", "momsum", &
-            "drdt", "drdt2", "chi_vw", "x_vw", "rho_mw", "k_mw", "T_bar", "grad_T"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[moms, msum, wght, abscX, &
+            & abscY, wght_pb, wght_mv, wght_ht, coeff, ht, r, q, &
+            & n_tait, B_tait, pres, rho, nbub, c, alf, momsum, &
+            & drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, T_bar, grad_T]')
         do id3 = is3_qbmm%beg, is3_qbmm%end
             do id2 = is2_qbmm%beg, is2_qbmm%end
                 do id1 = is1_qbmm%beg, is1_qbmm%end
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 416f9ca5bc..f76c81f4ed 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -75,13 +75,13 @@ module m_rhs
     !! conservative variables, which are located in q_cons_vf, at cell-interior
     !! Gaussian quadrature points (QP).
     type(vector_field) :: q_cons_qp !<
-    $:GPU_DECLARE(create=["q_cons_qp"])
+    $:GPU_DECLARE(create='[q_cons_qp]')
 
     !! The primitive variables at cell-interior Gaussian quadrature points. These
     !! are calculated from the conservative variables and gradient magnitude (GM)
     !! of the volume fractions, q_cons_qp and gm_alpha_qp, respectively.
     type(vector_field) :: q_prim_qp !<
-    $:GPU_DECLARE(create=["q_prim_qp"])
+    $:GPU_DECLARE(create='[q_prim_qp]')
 
     !> @name The first-order spatial derivatives of the primitive variables at cell-
     !! interior Gaussian quadrature points. These are WENO-reconstructed from
@@ -90,7 +90,7 @@ module m_rhs
     !! of the primitive variables, located in qK_prim_n, where K = L or R.
     !> @{
     type(vector_field), allocatable, dimension(:) :: dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp
-    $:GPU_DECLARE(create=["dq_prim_dx_qp","dq_prim_dy_qp","dq_prim_dz_qp"])
+    $:GPU_DECLARE(create='[dq_prim_dx_qp,dq_prim_dy_qp,dq_prim_dz_qp]')
     !> @}
 
     !> @name The left and right WENO-reconstructed cell-boundary values of the cell-
@@ -100,26 +100,26 @@ module m_rhs
     !> @{
     type(vector_field), allocatable, dimension(:) :: dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n
     type(vector_field), allocatable, dimension(:) :: dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n
-    $:GPU_DECLARE(create=["dqL_prim_dx_n","dqL_prim_dy_n","dqL_prim_dz_n"])
-    $:GPU_DECLARE(create=["dqR_prim_dx_n","dqR_prim_dy_n","dqR_prim_dz_n"])
+    $:GPU_DECLARE(create='[dqL_prim_dx_n,dqL_prim_dy_n,dqL_prim_dz_n]')
+    $:GPU_DECLARE(create='[dqR_prim_dx_n,dqR_prim_dy_n,dqR_prim_dz_n]')
     !> @}
 
     type(scalar_field), allocatable, dimension(:) :: tau_Re_vf
-    $:GPU_DECLARE(create=["tau_Re_vf"])
+    $:GPU_DECLARE(create='[tau_Re_vf]')
 
     type(vector_field) :: gm_alpha_qp  !<
     !! The gradient magnitude of the volume fractions at cell-interior Gaussian
     !! quadrature points. gm_alpha_qp is calculated from individual first-order
     !! spatial derivatives located in dq_prim_ds_qp.
 
-    $:GPU_DECLARE(create=["gm_alpha_qp"])
+    $:GPU_DECLARE(create='[gm_alpha_qp]')
 
     !> @name The left and right WENO-reconstructed cell-boundary values of the cell-
     !! average gradient magnitude of volume fractions, located in gm_alpha_qp.
     !> @{
     type(vector_field), allocatable, dimension(:) :: gm_alphaL_n
     type(vector_field), allocatable, dimension(:) :: gm_alphaR_n
-    $:GPU_DECLARE(create=["gm_alphaL_n","gm_alphaR_n"])
+    $:GPU_DECLARE(create='[gm_alphaL_n,gm_alphaR_n]')
     !> @}
 
     !> @name The cell-boundary values of the fluxes (src - source, gsrc - geometrical
@@ -129,38 +129,38 @@ module m_rhs
     type(vector_field), allocatable, dimension(:) :: flux_n
     type(vector_field), allocatable, dimension(:) :: flux_src_n
     type(vector_field), allocatable, dimension(:) :: flux_gsrc_n
-    $:GPU_DECLARE(create=["flux_n","flux_src_n","flux_gsrc_n"])
+    $:GPU_DECLARE(create='[flux_n,flux_src_n,flux_gsrc_n]')
     !> @}
 
     type(vector_field), allocatable, dimension(:) :: qL_prim, qR_prim
-    $:GPU_DECLARE(create=["qL_prim","qR_prim"])
+    $:GPU_DECLARE(create='[qL_prim,qR_prim]')
 
     type(int_bounds_info) :: iv !< Vector field indical bounds
-    $:GPU_DECLARE(create=["iv"])
+    $:GPU_DECLARE(create='[iv]')
 
     !> @name Indical bounds in the x-, y- and z-directions
     !> @{
     type(int_bounds_info) :: irx, iry, irz
-    $:GPU_DECLARE(create=["irx","iry","irz"])
+    $:GPU_DECLARE(create='[irx,iry,irz]')
 
     type(int_bounds_info) :: is1, is2, is3
-    $:GPU_DECLARE(create=["is1","is2","is3"])
+    $:GPU_DECLARE(create='[is1,is2,is3]')
 
     !> @name Saved fluxes for testing
     !> @{
     type(scalar_field) :: alf_sum
     !> @}
-    $:GPU_DECLARE(create=["alf_sum"])
+    $:GPU_DECLARE(create='[alf_sum]')
 
     real(wp), allocatable, dimension(:, :, :) :: blkmod1, blkmod2, alpha1, alpha2, Kterm
     real(wp), allocatable, dimension(:, :, :, :) :: qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf
     real(wp), allocatable, dimension(:, :, :, :) :: dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf
-    $:GPU_DECLARE(create=["blkmod1","blkmod2","alpha1","alpha2","Kterm"])
-    $:GPU_DECLARE(create=["qL_rsx_vf","qL_rsy_vf","qL_rsz_vf","qR_rsx_vf","qR_rsy_vf","qR_rsz_vf"])
-    $:GPU_DECLARE(create=["dqL_rsx_vf","dqL_rsy_vf","dqL_rsz_vf","dqR_rsx_vf","dqR_rsy_vf","dqR_rsz_vf"])
+    $:GPU_DECLARE(create='[blkmod1,blkmod2,alpha1,alpha2,Kterm]')
+    $:GPU_DECLARE(create='[qL_rsx_vf,qL_rsy_vf,qL_rsz_vf,qR_rsx_vf,qR_rsy_vf,qR_rsz_vf]')
+    $:GPU_DECLARE(create='[dqL_rsx_vf,dqL_rsy_vf,dqL_rsz_vf,dqR_rsx_vf,dqR_rsy_vf,dqR_rsz_vf]')
 
     real(wp), allocatable, dimension(:, :, :) :: nbub !< Bubble number density
-    $:GPU_DECLARE(create=["nbub"])
+    $:GPU_DECLARE(create='[nbub]')
 
 contains
 
@@ -173,8 +173,8 @@ contains
 
         integer :: num_eqns_after_adv
 
-        $:GPU_ENTER_DATA(copyin=["idwbuff","idwbuff"])
-        $:GPU_UPDATE(device=["idwbuff", "idwbuff"])
+        $:GPU_ENTER_DATA(copyin='[idwbuff,idwbuff]')
+        $:GPU_UPDATE(device='[idwbuff, idwbuff]')
 
         @:ALLOCATE(q_cons_qp%vf(1:sys_size))
         @:ALLOCATE(q_prim_qp%vf(1:sys_size))
@@ -201,29 +201,29 @@ contains
                 @:ALLOCATE(q_prim_qp%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
             else
                 q_prim_qp%vf(l)%sf => q_cons_qp%vf(l)%sf
-                $:GPU_ENTER_DATA(copyin=["q_prim_qp%vf(l)%sf"])
-                $:GPU_ENTER_DATA(attach=["q_prim_qp%vf(l)%sf"])
+                $:GPU_ENTER_DATA(copyin='[q_prim_qp%vf(l)%sf]')
+                $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(l)%sf]')
             end if
         end do
 
         do l = adv_idx%beg, adv_idx%end
             q_prim_qp%vf(l)%sf => q_cons_qp%vf(l)%sf
-            $:GPU_ENTER_DATA(copyin=["q_prim_qp%vf(l)%sf"])
-            $:GPU_ENTER_DATA(attach=["q_prim_qp%vf(l)%sf"])
+            $:GPU_ENTER_DATA(copyin='[q_prim_qp%vf(l)%sf]')
+            $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(l)%sf]')
         end do
 
         if (surface_tension) then
             q_prim_qp%vf(c_idx)%sf => &
                 q_cons_qp%vf(c_idx)%sf
-            $:GPU_ENTER_DATA(copyin=["q_prim_qp%vf(c_idx)%sf"])
-            $:GPU_ENTER_DATA(attach=["q_prim_qp%vf(c_idx)%sf"])
+            $:GPU_ENTER_DATA(copyin='[q_prim_qp%vf(c_idx)%sf]')
+            $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(c_idx)%sf]')
         end if
 
         if (cont_damage) then
             q_prim_qp%vf(damage_idx)%sf => &
                 q_cons_qp%vf(damage_idx)%sf
-            $:GPU_ENTER_DATA(copyin=["q_prim_qp%vf(damage_idx)%sf"])
-            $:GPU_ENTER_DATA(attach=["q_prim_qp%vf(damage_idx)%sf"])
+            $:GPU_ENTER_DATA(copyin='[q_prim_qp%vf(damage_idx)%sf]')
+            $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(damage_idx)%sf]')
         end if
 
         if (viscous) then
@@ -548,14 +548,14 @@ contains
                 if (riemann_solver /= 1 .and. riemann_solver /= 4) then
                     do l = adv_idx%beg + 1, adv_idx%end
                         flux_src_n(i)%vf(l)%sf => flux_src_n(i)%vf(adv_idx%beg)%sf
-                        $:GPU_ENTER_DATA(attach=["flux_src_n(i)%vf(l)%sf"])
+                        $:GPU_ENTER_DATA(attach='[flux_src_n(i)%vf(l)%sf]')
                     end do
                 end if
             else
                 do l = 1, sys_size
                     flux_n(i)%vf(l)%sf => flux_n(1)%vf(l)%sf
                     flux_src_n(i)%vf(l)%sf => flux_src_n(1)%vf(l)%sf
-                    $:GPU_ENTER_DATA(attach=["flux_n(i)%vf(l)%sf","flux_src_n(i)%vf(l)%sf"])
+                    $:GPU_ENTER_DATA(attach='[flux_n(i)%vf(l)%sf,flux_src_n(i)%vf(l)%sf]')
                 end do
             end if
         end do
@@ -978,7 +978,7 @@ contains
                 call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz)
             end if
 
-            $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
+            $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]')
             do j = 1, sys_size
                 do q_loop = 0, p
                     do l_loop = 0, n
@@ -993,8 +993,8 @@ contains
             end do
 
             if (model_eqns == 3) then
-                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
-                    & "pressure_val","flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, &
+                    & pressure_val,flux_face1,flux_face2]')
                 do q_loop = 0, p
                     do l_loop = 0, n
                         do k_loop = 0, m
@@ -1023,7 +1023,7 @@ contains
                 call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz)
             end if
 
-            $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
+            $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]')
             do j = 1, sys_size
                 do l = 0, p
                     do k = 0, n
@@ -1038,8 +1038,8 @@ contains
             end do
 
             if (model_eqns == 3) then
-                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
-                    & "pressure_val","flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, &
+                    & pressure_val,flux_face1,flux_face2]')
                 do l = 0, p
                     do k = 0, n
                         do q = 0, m
@@ -1064,7 +1064,7 @@ contains
             end if
 
             if (cyl_coord) then
-                $:GPU_PARALLEL_LOOP(collapse=4,private=["flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private='[flux_face1,flux_face2]')
                 do j = 1, sys_size
                     do l = 0, p
                         do k = 0, n
@@ -1090,8 +1090,8 @@ contains
             end if
 
             if (grid_geometry == 3) then ! Cylindrical Coordinates
-                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","velocity_val", &
-                    & "flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,velocity_val, &
+                    & flux_face1,flux_face2]')
                 do j = 1, sys_size
                     do k = 0, p
                         do q = 0, n
@@ -1106,7 +1106,7 @@ contains
                         end do
                     end do
                 end do
-                $:GPU_PARALLEL_LOOP(collapse=4,private=["flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private='[flux_face1,flux_face2]')
                 do j = 1, sys_size
                     do k = 0, p
                         do q = 0, n
@@ -1120,7 +1120,7 @@ contains
                     end do
                 end do
             else ! Cartesian Coordinates
-                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]')
                 do j = 1, sys_size
                     do k = 0, p
                         do q = 0, n
@@ -1136,8 +1136,8 @@ contains
             end if
 
             if (model_eqns == 3) then
-                $:GPU_PARALLEL_LOOP(collapse=4,private=["inv_ds","advected_qty_val", &
-                    & "pressure_val","flux_face1","flux_face2"])
+                $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, &
+                    & pressure_val,flux_face1,flux_face2]')
                 do k = 0, p
                     do q = 0, n
                         do l = 0, m
@@ -1181,8 +1181,8 @@ contains
             case (1) ! x-direction
                 use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4)
                 if (use_standard_riemann) then
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
-                        & "local_term_coeff","local_flux1","local_flux2"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, &
+                        & local_term_coeff,local_flux1,local_flux2]')
                     do j_adv = advxb, advxe
                         do q_idx = 0, p ! z_extent
                             do l_idx = 0, n ! y_extent
@@ -1200,9 +1200,9 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
-                                "local_q_cons_val", "local_k_term_val", &
-                                "local_term_coeff", "local_flux1", "local_flux2"])
+                            $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, &
+                                & local_q_cons_val, local_k_term_val, &
+                                & local_term_coeff, local_flux1, local_flux2]')
                             do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m
                                         local_inv_ds = 1._wp/dx(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(k_idx, l_idx, q_idx)
@@ -1214,10 +1214,10 @@ contains
                                                                                     local_inv_ds*local_term_coeff*(local_flux1 - local_flux2)
                                     end do; end do; end do
 
-                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds",&
-                                "local_q_cons_val", "local_k_term_val", &
-                                "local_term_coeff", "local_flux1", &
-                                "local_flux2"])
+                            $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds,&
+                                & local_q_cons_val, local_k_term_val, &
+                                & local_term_coeff, local_flux1, &
+                                & local_flux2]')
                             do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m
                                         local_inv_ds = 1._wp/dx(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(k_idx, l_idx, q_idx)
@@ -1230,8 +1230,8 @@ contains
                                     end do; end do; end do
                         end if
                     else ! NOT alt_soundspeed
-                        $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
-                            & "local_term_coeff","local_flux1","local_flux2"])
+                        $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, &
+                            & local_term_coeff,local_flux1,local_flux2]')
                         do j_adv = advxb, advxe
                             do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m
                                         local_inv_ds = 1._wp/dx(k_idx)
@@ -1248,8 +1248,8 @@ contains
             case (2) ! y-direction: loops q_idx (x), k_idx (y), l_idx (z); sf(q_idx, k_idx, l_idx); dy(k_idx); Kterm(q_idx,k_idx,l_idx)
                 use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4)
                 if (use_standard_riemann) then
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
-                        & "local_term_coeff","local_flux1","local_flux2"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, &
+                        & local_term_coeff,local_flux1,local_flux2]')
                     do j_adv = advxb, advxe
                         do l_idx = 0, p ! z_extent
                             do k_idx = 0, n ! y_extent
@@ -1267,10 +1267,10 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
-                                "local_q_cons_val", "local_k_term_val", &
-                                "local_term_coeff", "local_flux1", &
-                                "local_flux2"])
+                            $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, &
+                                & local_q_cons_val, local_k_term_val, &
+                                & local_term_coeff, local_flux1, &
+                                & local_flux2]')
                             do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m
                                         local_inv_ds = 1._wp/dy(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(q_idx, k_idx, l_idx)
@@ -1286,10 +1286,10 @@ contains
                                         end if
                                     end do; end do; end do
 
-                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
-                                "local_q_cons_val", "local_k_term_val", &
-                                "local_term_coeff", "local_flux1", &
-                                "local_flux2"])
+                            $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, &
+                                & local_q_cons_val, local_k_term_val, &
+                                & local_term_coeff, local_flux1, &
+                                & local_flux2]')
                             do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m
                                         local_inv_ds = 1._wp/dy(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(q_idx, k_idx, l_idx)
@@ -1306,8 +1306,8 @@ contains
                                     end do; end do; end do
                         end if
                     else ! NOT alt_soundspeed
-                        $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
-                            & "local_term_coeff","local_flux1","local_flux2"])
+                        $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, &
+                            & local_term_coeff,local_flux1,local_flux2]')
                         do j_adv = advxb, advxe
                             do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m
                                         local_inv_ds = 1._wp/dy(k_idx)
@@ -1329,8 +1329,8 @@ contains
                 end if
 
                 if (use_standard_riemann) then
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
-                        & "local_term_coeff","local_flux1","local_flux2"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, &
+                        & local_term_coeff,local_flux1,local_flux2]')
                     do j_adv = advxb, advxe
                         do k_idx = 0, p ! z_extent
                             do q_idx = 0, n ! y_extent
@@ -1348,10 +1348,10 @@ contains
                 else ! Other Riemann solvers
                     if (alt_soundspeed) then
                         if (bubbles_euler .neqv. .true.) then
-                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
-                                "local_q_cons_val", "local_k_term_val", &
-                                "local_term_coeff", "local_flux1", &
-                                "local_flux2"])
+                            $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, &
+                                & local_q_cons_val, local_k_term_val, &
+                                & local_term_coeff, local_flux1, &
+                                & local_flux2]')
                             do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m
                                         local_inv_ds = 1._wp/dz(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(l_idx, q_idx, k_idx)
@@ -1363,10 +1363,10 @@ contains
                                                                                     local_inv_ds*local_term_coeff*(local_flux1 - local_flux2)
                                     end do; end do; end do
 
-                            $:GPU_PARALLEL_LOOP(collapse=3, private=["local_inv_ds", &
-                                "local_q_cons_val", "local_k_term_val", &
-                                "local_term_coeff", "local_flux1", &
-                                "local_flux2"])
+                            $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, &
+                                & local_q_cons_val, local_k_term_val, &
+                                & local_term_coeff, local_flux1, &
+                                & local_flux2]')
                             do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m
                                         local_inv_ds = 1._wp/dz(k_idx)
                                         local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(l_idx, q_idx, k_idx)
@@ -1379,8 +1379,8 @@ contains
                                     end do; end do; end do
                         end if
                     else ! NOT alt_soundspeed
-                        $:GPU_PARALLEL_LOOP(collapse=4,private=["local_inv_ds", &
-                            & "local_term_coeff","local_flux1","local_flux2"])
+                        $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, &
+                            & local_term_coeff,local_flux1,local_flux2]')
                         do j_adv = advxb, advxe
                             do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m
                                         local_inv_ds = 1._wp/dz(k_idx)
@@ -1729,7 +1729,7 @@ contains
 
         end if
 
-        $:GPU_UPDATE(device=["is1","is2","is3","iv"])
+        $:GPU_UPDATE(device='[is1,is2,is3,iv]')
 
         if (recon_dir == 1) then
             $:GPU_PARALLEL_LOOP(collapse=4)
@@ -1787,13 +1787,13 @@ contains
                 @:DEALLOCATE(q_cons_qp%vf(j)%sf)
                 @:DEALLOCATE(q_prim_qp%vf(j)%sf)
             else
-                $:GPU_EXIT_DATA(detach=["q_prim_qp%vf(j)%sf"])
+                $:GPU_EXIT_DATA(detach='[q_prim_qp%vf(j)%sf]')
                 nullify (q_prim_qp%vf(j)%sf)
             end if
         end do
 
         do j = adv_idx%beg, adv_idx%end
-            $:GPU_EXIT_DATA(detach=["q_prim_qp%vf(j)%sf"])
+            $:GPU_EXIT_DATA(detach='[q_prim_qp%vf(j)%sf]')
             nullify (q_prim_qp%vf(j)%sf)
         end do
 
@@ -1826,7 +1826,7 @@ contains
         end if
 
         if (mpp_lim .and. bubbles_euler) then
-            $:GPU_EXIT_DATA(delete=["alf_sum%sf"])
+            $:GPU_EXIT_DATA(delete='[alf_sum%sf]')
             deallocate (alf_sum%sf)
         end if
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index a55b0c144c..e1eb2ed76d 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -65,7 +65,7 @@ module m_riemann_solvers
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf, flux_src_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf, flux_src_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf, flux_src_rsz_vf
-    $:GPU_DECLARE(create=["flux_rsx_vf","flux_src_rsx_vf","flux_rsy_vf","flux_src_rsy_vf","flux_rsz_vf","flux_src_rsz_vf"])
+    $:GPU_DECLARE(create='[flux_rsx_vf,flux_src_rsx_vf,flux_rsy_vf,flux_src_rsy_vf,flux_rsz_vf,flux_src_rsz_vf]')
     !> @}
 
     !> The cell-boundary values of the geometrical source flux that are computed
@@ -76,7 +76,7 @@ module m_riemann_solvers
     real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsx_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsy_vf !<
     real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsz_vf !<
-    $:GPU_DECLARE(create=["flux_gsrc_rsx_vf","flux_gsrc_rsy_vf","flux_gsrc_rsz_vf"])
+    $:GPU_DECLARE(create='[flux_gsrc_rsx_vf,flux_gsrc_rsy_vf,flux_gsrc_rsz_vf]')
     !> @}
 
     ! The cell-boundary values of the velocity. vel_src_rs_vf is determined as
@@ -85,17 +85,17 @@ module m_riemann_solvers
     real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsz_vf
-    $:GPU_DECLARE(create=["vel_src_rsx_vf","vel_src_rsy_vf","vel_src_rsz_vf"])
+    $:GPU_DECLARE(create='[vel_src_rsx_vf,vel_src_rsy_vf,vel_src_rsz_vf]')
 
     real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsz_vf
-    $:GPU_DECLARE(create=["mom_sp_rsx_vf","mom_sp_rsy_vf","mom_sp_rsz_vf"])
+    $:GPU_DECLARE(create='[mom_sp_rsx_vf,mom_sp_rsy_vf,mom_sp_rsz_vf]')
 
     real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsx_vf
     real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsy_vf
     real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsz_vf
-    $:GPU_DECLARE(create=["Re_avg_rsx_vf","Re_avg_rsy_vf","Re_avg_rsz_vf"])
+    $:GPU_DECLARE(create='[Re_avg_rsx_vf,Re_avg_rsy_vf,Re_avg_rsz_vf]')
 
     !> @name Indical bounds in the s1-, s2- and s3-directions
     !> @{
@@ -103,13 +103,13 @@ module m_riemann_solvers
     type(int_bounds_info) :: isx, isy, isz
     !> @}
 
-    $:GPU_DECLARE(create=["is1","is2","is3","isx","isy","isz"])
+    $:GPU_DECLARE(create='[is1,is2,is3,isx,isy,isz]')
 
     real(wp), allocatable, dimension(:) :: Gs
-    $:GPU_DECLARE(create=["Gs"])
+    $:GPU_DECLARE(create='[Gs]')
 
     real(wp), allocatable, dimension(:, :) :: Res
-    $:GPU_DECLARE(create=["Res"])
+    $:GPU_DECLARE(create='[Res]')
 
 contains
 
@@ -354,14 +354,14 @@ contains
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
 
             if (norm_dir == ${NORM_DIR}$) then
-                $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", "alpha_rho_R", &
-                    "vel_L", "vel_R", "alpha_L", "alpha_R", "tau_e_L", "tau_e_R", &
-                    "G_L", "G_R", "Re_L", "Re_R", "rho_avg", "h_avg", "gamma_avg", &
-                    "s_L", "s_R", "s_S", "Ys_L", "Ys_R", "xi_field_L", "xi_field_R", &
-                    "Cp_iL", "Cp_iR", "Xs_L", "Xs_R", "Gamma_iL", "Gamma_iR", &
-                    "Yi_avg", "Phi_avg", "h_iL", "h_iR", "h_avg_2", "c_fast", &
-                    "pres_mag", "B", "Ga", "vdotB", "B2", "b4", "cm", "pcorr", &
-                    "zcoef", "vel_L_tmp", "vel_R_tmp"])
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, &
+                    & vel_L, vel_R, alpha_L, alpha_R, tau_e_L, tau_e_R, &
+                    & G_L, G_R, Re_L, Re_R, rho_avg, h_avg, gamma_avg, &
+                    & s_L, s_R, s_S, Ys_L, Ys_R, xi_field_L, xi_field_R, &
+                    & Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR, &
+                    & Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, c_fast, &
+                    & pres_mag, B, Ga, vdotB, B2, b4, cm, pcorr, &
+                    & zcoef, vel_L_tmp, vel_R_tmp]')
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = is1%beg, is1%end
@@ -1283,14 +1283,14 @@ contains
                 ! 6-EQUATION MODEL WITH HLLC
                 if (model_eqns == 3) then
                     !ME3
-                    $:GPU_PARALLEL_LOOP(collapse=3, private=["vel_L", "vel_R", &
-                        "vel_K_Star", "Re_L", "Re_R", "rho_avg", "h_avg", &
-                        "gamma_avg", "s_L", "s_R", "s_S", "vel_avg_rms", &
-                        "alpha_L", "alpha_R", "Ys_L", "Ys_R", "Xs_L", "Xs_R", &
-                        "Gamma_iL", "Gamma_iR", "Cp_iL", "Cp_iR", "Yi_avg", &
-                        "Phi_avg", "h_iL", "h_iR", "h_avg_2", "tau_e_L", &
-                        "tau_e_R", "G_L", "G_R", "flux_ene_e", "xi_field_L", &
-                        "xi_field_R", "pcorr", "zcoef", "vel_L_tmp", "vel_R_tmp"])
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[vel_L, vel_R, &
+                        & vel_K_Star, Re_L, Re_R, rho_avg, h_avg, &
+                        & gamma_avg, s_L, s_R, s_S, vel_avg_rms, &
+                        & alpha_L, alpha_R, Ys_L, Ys_R, Xs_L, Xs_R, &
+                        & Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, Yi_avg, &
+                        & Phi_avg, h_iL, h_iR, h_avg_2, tau_e_L, &
+                        & tau_e_R, G_L, G_R, flux_ene_e, xi_field_L, &
+                        & xi_field_R, pcorr, zcoef, vel_L_tmp, vel_R_tmp]')
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
@@ -1716,10 +1716,10 @@ contains
 
                 elseif (model_eqns == 4) then
                     !ME4
-                    $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", &
-                        "alpha_rho_R", "vel_L", "vel_R", "alpha_L", "alpha_R", &
-                        "rho_avg", "h_avg", "gamma_avg", "s_L", "s_R", "s_S", &
-                        "vel_avg_rms", "nbub_L", "nbub_R", "ptilde_L", "ptilde_R"])
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, &
+                        & alpha_rho_R, vel_L, vel_R, alpha_L, alpha_R, &
+                        & rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, &
+                        & vel_avg_rms, nbub_L, nbub_R, ptilde_L, ptilde_R]')
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
@@ -1969,12 +1969,12 @@ contains
                     !$acc end parallel loop
 
                 elseif (model_eqns == 2 .and. bubbles_euler) then
-                    $:GPU_PARALLEL_LOOP(collapse=3, private=["R0_L", "R0_R", "V0_L", &
-                        "V0_R", "P0_L", "P0_R", "pbw_L", "pbw_R", "vel_L", &
-                        "vel_R", "rho_avg", "alpha_L", "alpha_R", "h_avg", &
-                        "gamma_avg", "s_L", "s_R", "s_S", "nbub_L", "nbub_R", &
-                        "ptilde_L", "ptilde_R", "vel_avg_rms", "Re_L", "Re_R", &
-                        "pcorr", "zcoef", "vel_L_tmp", "vel_R_tmp"])
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[R0_L, R0_R, V0_L, &
+                        & V0_R, P0_L, P0_R, pbw_L, pbw_R, vel_L, &
+                        & vel_R, rho_avg, alpha_L, alpha_R, h_avg, &
+                        & gamma_avg, s_L, s_R, s_S, nbub_L, nbub_R, &
+                        & ptilde_L, ptilde_R, vel_avg_rms, Re_L, Re_R, &
+                        & pcorr, zcoef, vel_L_tmp, vel_R_tmp]')
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
@@ -2440,15 +2440,15 @@ contains
                     !$acc end parallel loop
                 else
                     ! 5-EQUATION MODEL WITH HLLC
-                    $:GPU_PARALLEL_LOOP(collapse=3, private=["vel_L", "vel_R", &
-                        "Re_L", "Re_R", "rho_avg", "h_avg", "gamma_avg", &
-                        "alpha_L", "alpha_R", "s_L", "s_R", "s_S", &
-                        "vel_avg_rms", "pcorr", "zcoef", "vel_L_tmp", &
-                        "vel_R_tmp", "Ys_L", "Ys_R", "Xs_L", "Xs_R", &
-                        "Gamma_iL", "Gamma_iR", "Cp_iL", "Cp_iR", "tau_e_L", &
-                        "tau_e_R", "xi_field_L", "xi_field_R", "Yi_avg", &
-                        "Phi_avg", "h_iL", "h_iR", "h_avg_2"], &
-                        copyin=["is1", "is2", "is3"])
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[vel_L, vel_R, &
+                        & Re_L, Re_R, rho_avg, h_avg, gamma_avg, &
+                        & alpha_L, alpha_R, s_L, s_R, s_S, &
+                        & vel_avg_rms, pcorr, zcoef, vel_L_tmp, &
+                        & vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, &
+                        & Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, tau_e_L, &
+                        & tau_e_R, xi_field_L, xi_field_R, Yi_avg, &
+                        & Phi_avg, h_iL, h_iR, h_avg_2]', &
+                        & copyin='[is1, is2, is3]')
                     do l = is3%beg, is3%end
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
@@ -3065,12 +3065,12 @@ contains
 
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
             if (norm_dir == ${NORM_DIR}$) then
-                $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_rho_L", &
-                    "alpha_rho_R", "vel", "alpha_L", "alpha_R", "rho", "pres", &
-                    "E", "H_no_mag", "gamma", "pi_inf", "qv", "vel_rms", "B", &
-                    "c", "c_fast", "pres_mag", "U_L", "U_R", "U_starL", &
-                    "U_starR", "U_doubleL", "U_doubleR", "F_L", "F_R", &
-                    "F_starL", "F_starR", "F_hlld"])
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, &
+                    & alpha_rho_R, vel, alpha_L, alpha_R, rho, pres, &
+                    & E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, &
+                    & c, c_fast, pres_mag, U_L, U_R, U_starL, &
+                    & U_starR, U_doubleL, U_doubleR, F_L, F_R, &
+                    & F_starL, F_starR, F_hlld]')
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = is1%beg, is1%end
@@ -3322,7 +3322,7 @@ contains
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
-        $:GPU_UPDATE(device=["Gs"])
+        $:GPU_UPDATE(device='[Gs]')
 
         if (viscous) then
             @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
@@ -3334,10 +3334,10 @@ contains
                     Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
                 end do
             end do
-            $:GPU_UPDATE(device=["Res","Re_idx","Re_size"])
+            $:GPU_UPDATE(device='[Res,Re_idx,Re_size]')
         end if
 
-        $:GPU_ENTER_DATA(copyin=["is1","is2","is3","isx","isy","isz"])
+        $:GPU_ENTER_DATA(copyin='[is1,is2,is3,isx,isy,isz]')
 
         is1%beg = -1; is2%beg = 0; is3%beg = 0
         is1%end = m; is2%end = n; is3%end = p
@@ -3480,7 +3480,7 @@ contains
             dir_idx = (/3, 1, 2/); dir_flg = (/0._wp, 0._wp, 1._wp/)
         end if
 
-        $:GPU_UPDATE(device=["is1","is2","is3"])
+        $:GPU_UPDATE(device='[is1,is2,is3]')
 
         if (elasticity) then
             if (norm_dir == 1) then
@@ -3494,9 +3494,9 @@ contains
 
         isx = ix; isy = iy; isz = iz
         ! for stuff in the same module
-        $:GPU_UPDATE(device=["isx","isy","isz"])
+        $:GPU_UPDATE(device='[isx,isy,isz]')
         ! for stuff in different modules
-        $:GPU_UPDATE(device=["dir_idx","dir_flg","dir_idx_tau"])
+        $:GPU_UPDATE(device='[dir_idx,dir_flg,dir_idx_tau]')
 
         ! Population of Buffers in x-direction
         if (norm_dir == 1) then
@@ -3984,10 +3984,10 @@ contains
         integer :: i_vel             !!< Loop iterator for velocity components.
         integer :: idx_rp(3)         !!< Indices $(j,k,l)$ of 'right' point for averaging.
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["idx_rp", "avg_v_int", &
-            "avg_dvdx_int", "avg_dvdy_int", "avg_dvdz_int", "Re_s", "Re_b", &
-            "vel_src_int", "r_eff", "divergence_cyl", "stress_vector_shear", &
-            "stress_normal_bulk", "div_v_term_const"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_rp, avg_v_int, &
+            & avg_dvdx_int, avg_dvdy_int, avg_dvdz_int, Re_s, Re_b, &
+            & vel_src_int, r_eff, divergence_cyl, stress_vector_shear, &
+            & stress_normal_bulk, div_v_term_const]')
         do l = iz%beg, iz%end
             do k = iy%beg, iy%end
                 do j = ix%beg, ix%end
@@ -4147,9 +4147,9 @@ contains
 
         real(wp) :: divergence_v   !< Velocity divergence at interface.
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["idx_right_phys", "vel_grad_avg", &
-            "current_tau_shear", "current_tau_bulk", "vel_src_at_interface", &
-            "Re_shear", "Re_bulk", "divergence_v", "i_dim", "vel_comp_idx"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_right_phys, vel_grad_avg, &
+            & current_tau_shear, current_tau_bulk, vel_src_at_interface, &
+            & Re_shear, Re_bulk, divergence_v, i_dim, vel_comp_idx]')
         do l_loop = isz%beg, isz%end
             do k_loop = isy%beg, isy%end
                 do j_loop = isx%beg, isx%end
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 9be0c31c2b..1fed473b65 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -276,17 +276,17 @@ contains
 
         bc_type(1, -1)%sf(:, :, :) = bc_x%beg
         bc_type(1, 1)%sf(:, :, :) = bc_x%end
-        $:GPU_UPDATE(device=["bc_type(1,-1)%sf","bc_type(1,1)%sf"])
+        $:GPU_UPDATE(device='[bc_type(1,-1)%sf,bc_type(1,1)%sf]')
 
         if (n > 0) then
             bc_type(2, -1)%sf(:, :, :) = bc_y%beg
             bc_type(2, 1)%sf(:, :, :) = bc_y%end
-            $:GPU_UPDATE(device=["bc_type(2,-1)%sf","bc_type(2,1)%sf"])
+            $:GPU_UPDATE(device='[bc_type(2,-1)%sf,bc_type(2,1)%sf]')
 
             if (p > 0) then
                 bc_type(3, -1)%sf(:, :, :) = bc_z%beg
                 bc_type(3, 1)%sf(:, :, :) = bc_z%end
-                $:GPU_UPDATE(device=["bc_type(3,-1)%sf","bc_type(3,1)%sf"])
+                $:GPU_UPDATE(device='[bc_type(3,-1)%sf,bc_type(3,1)%sf]')
             end if
         end if
 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index a89d882b82..5bf26ad81f 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1263,12 +1263,12 @@ contains
         if (cfl_dt) then
             if ((mytime + dt) >= t_stop) then
                 dt = t_stop - mytime
-                $:GPU_UPDATE(device=["dt"])
+                $:GPU_UPDATE(device='[dt]')
             end if
         else
             if ((mytime + dt) >= finaltime) then
                 dt = finaltime - mytime
-                $:GPU_UPDATE(device=["dt"])
+                $:GPU_UPDATE(device='[dt]')
             end if
         end if
 
@@ -1292,7 +1292,7 @@ contains
 
         if (probe_wrt) then
             do i = 1, sys_size
-                $:GPU_UPDATE(host=["q_cons_ts(1)%vf(i)%sf"])
+                $:GPU_UPDATE(host='[q_cons_ts(1)%vf(i)%sf]')
             end do
         end if
 
@@ -1395,7 +1395,7 @@ contains
         call cpu_time(start)
         call nvtxStartRange("SAVE-DATA")
         do i = 1, sys_size
-            $:GPU_UPDATE(host=["q_cons_ts(1)%vf(i)%sf"])
+            $:GPU_UPDATE(host='[q_cons_ts(1)%vf(i)%sf]')
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
@@ -1409,8 +1409,8 @@ contains
         end do
 
         if (qbmm .and. .not. polytropic) then
-            $:GPU_UPDATE(host=["pb_ts(1)%sf"])
-            $:GPU_UPDATE(host=["mv_ts(1)%sf"])
+            $:GPU_UPDATE(host='[pb_ts(1)%sf]')
+            $:GPU_UPDATE(host='[mv_ts(1)%sf]')
         end if
 
         if (cfl_dt) then
@@ -1420,16 +1420,16 @@ contains
         end if
 
         if (bubbles_lagrange) then
-            $:GPU_UPDATE(host=["intfc_rad"])
+            $:GPU_UPDATE(host='[intfc_rad]')
             do i = 1, nBubs
                 if (ieee_is_nan(intfc_rad(i, 1)) .or. intfc_rad(i, 1) <= 0._wp) then
                     call s_mpi_abort("Bubble radius is negative or NaN, please reduce dt.")
                 end if
             end do
 
-            $:GPU_UPDATE(host=["q_beta%vf(1)%sf"])
+            $:GPU_UPDATE(host='[q_beta%vf(1)%sf]')
             call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, q_beta%vf(1))
-            $:GPU_UPDATE(host=["Rmax_stats","Rmin_stats","gas_p","gas_mv","intfc_vel"])
+            $:GPU_UPDATE(host='[Rmax_stats,Rmin_stats,gas_p,gas_mv,intfc_vel]')
             call s_write_restart_lag_bubbles(save_count) !parallel
             if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats()
         else
@@ -1626,44 +1626,44 @@ contains
         integer :: i
         !Update GPU DATA
         do i = 1, sys_size
-            $:GPU_UPDATE(device=["q_cons_ts(1)%vf(i)%sf"])
+            $:GPU_UPDATE(device='[q_cons_ts(1)%vf(i)%sf]')
         end do
 
         if (qbmm .and. .not. polytropic) then
-            $:GPU_UPDATE(device=["pb_ts(1)%sf","mv_ts(1)%sf"])
+            $:GPU_UPDATE(device='[pb_ts(1)%sf,mv_ts(1)%sf]')
         end if
         if (chemistry) then
-            $:GPU_UPDATE(device=["q_T_sf%sf"])
+            $:GPU_UPDATE(device='[q_T_sf%sf]')
         end if
-        $:GPU_UPDATE(device=["nb","R0ref","Ca","Web","Re_inv","weight","R0","V0", &
-            & "bubbles_euler","polytropic","polydisperse","qbmm","R0_type", &
-            & "ptil","bubble_model","thermal","poly_sigma","adv_n","adap_dt", &
-            & "adap_dt_tol","n_idx","pi_fac","low_Mach"])
-        $:GPU_UPDATE(device=["R_n","R_v","phi_vn","phi_nv","Pe_c","Tw","pv","M_n", &
-            & "M_v","k_n","k_v","pb0","mass_n0","mass_v0","Pe_T","Re_trans_T", &
-            & "Re_trans_c","Im_trans_T","Im_trans_c","omegaN","mul0","ss", &
-            & "gamma_v","mu_v","gamma_m","gamma_n","mu_n","gam"])
+        $:GPU_UPDATE(device='[nb,R0ref,Ca,Web,Re_inv,weight,R0,V0, &
+            & bubbles_euler,polytropic,polydisperse,qbmm,R0_type, &
+            & ptil,bubble_model,thermal,poly_sigma,adv_n,adap_dt, &
+            & adap_dt_tol,n_idx,pi_fac,low_Mach]')
+        $:GPU_UPDATE(device='[R_n,R_v,phi_vn,phi_nv,Pe_c,Tw,pv,M_n, &
+            & M_v,k_n,k_v,pb0,mass_n0,mass_v0,Pe_T,Re_trans_T, &
+            & Re_trans_c,Im_trans_T,Im_trans_c,omegaN,mul0,ss, &
+            & gamma_v,mu_v,gamma_m,gamma_n,mu_n,gam]')
 
-        $:GPU_UPDATE(device=["acoustic_source", "num_source"])
-        $:GPU_UPDATE(device=["sigma", "surface_tension"])
+        $:GPU_UPDATE(device='[acoustic_source, num_source]')
+        $:GPU_UPDATE(device='[sigma, surface_tension]')
 
-        $:GPU_UPDATE(device=["dx","dy","dz","x_cb","x_cc","y_cb","y_cc","z_cb","z_cc"])
+        $:GPU_UPDATE(device='[dx,dy,dz,x_cb,x_cc,y_cb,y_cc,z_cb,z_cc]')
 
-        $:GPU_UPDATE(device=["bc_x%vb1","bc_x%vb2","bc_x%vb3","bc_x%ve1","bc_x%ve2","bc_x%ve3"])
-        $:GPU_UPDATE(device=["bc_y%vb1","bc_y%vb2","bc_y%vb3","bc_y%ve1","bc_y%ve2","bc_y%ve3"])
-        $:GPU_UPDATE(device=["bc_z%vb1","bc_z%vb2","bc_z%vb3","bc_z%ve1","bc_z%ve2","bc_z%ve3"])
+        $:GPU_UPDATE(device='[bc_x%vb1,bc_x%vb2,bc_x%vb3,bc_x%ve1,bc_x%ve2,bc_x%ve3]')
+        $:GPU_UPDATE(device='[bc_y%vb1,bc_y%vb2,bc_y%vb3,bc_y%ve1,bc_y%ve2,bc_y%ve3]')
+        $:GPU_UPDATE(device='[bc_z%vb1,bc_z%vb2,bc_z%vb3,bc_z%ve1,bc_z%ve2,bc_z%ve3]')
 
-        $:GPU_UPDATE(device=["bc_x%grcbc_in","bc_x%grcbc_out","bc_x%grcbc_vel_out"])
-        $:GPU_UPDATE(device=["bc_y%grcbc_in","bc_y%grcbc_out","bc_y%grcbc_vel_out"])
-        $:GPU_UPDATE(device=["bc_z%grcbc_in","bc_z%grcbc_out","bc_z%grcbc_vel_out"])
+        $:GPU_UPDATE(device='[bc_x%grcbc_in,bc_x%grcbc_out,bc_x%grcbc_vel_out]')
+        $:GPU_UPDATE(device='[bc_y%grcbc_in,bc_y%grcbc_out,bc_y%grcbc_vel_out]')
+        $:GPU_UPDATE(device='[bc_z%grcbc_in,bc_z%grcbc_out,bc_z%grcbc_vel_out]')
 
-        $:GPU_UPDATE(device=["relax", "relax_model"])
+        $:GPU_UPDATE(device='[relax, relax_model]')
         if (relax) then
-            $:GPU_UPDATE(device=["palpha_eps", "ptgalpha_eps"])
+            $:GPU_UPDATE(device='[palpha_eps, ptgalpha_eps]')
         end if
 
         if (ib) then
-            $:GPU_UPDATE(device=["ib_markers%sf"])
+            $:GPU_UPDATE(device='[ib_markers%sf]')
         end if
     end subroutine s_initialize_gpu_vars
 
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index f6181237ee..fd84e50664 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -29,16 +29,16 @@ module m_surface_tension
     !> @{
     type(scalar_field), allocatable, dimension(:) :: c_divs
     !> @)
-    $:GPU_DECLARE(create=["c_divs"])
+    $:GPU_DECLARE(create='[c_divs]')
 
     !> @name cell boundary reconstructed gradient components and magnitude
     !> @{
     real(wp), allocatable, dimension(:, :, :, :) :: gL_x, gR_x, gL_y, gR_y, gL_z, gR_z
     !> @}
-    $:GPU_DECLARE(create=["gL_x","gR_x","gL_y","gR_y","gL_z","gR_z"])
+    $:GPU_DECLARE(create='[gL_x,gR_x,gL_y,gR_y,gL_z,gR_z]')
 
     type(int_bounds_info) :: is1, is2, is3, iv
-    $:GPU_DECLARE(create=["is1","is2","is3","iv"])
+    $:GPU_DECLARE(create='[is1,is2,is3,iv]')
 
 contains
 
@@ -85,9 +85,9 @@ contains
         integer :: j, k, l, i
 
         if (id == 1) then
-            $:GPU_PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
-                "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", &
-                "normWR", "normW"])
+            $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, &
+                & w1R, w2R, w3R, w1, w2, w3, normWL, &
+                & normWR, normW]')
             do l = isz%beg, isz%end
                 do k = isy%beg, isy%end
                     do j = isx%beg, isx%end
@@ -132,9 +132,9 @@ contains
 
         elseif (id == 2) then
 
-            $:GPU_PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
-                "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", "normWR", &
-                "normW"])
+            $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, &
+                & w1R, w2R, w3R, w1, w2, w3, normWL, normWR, &
+                & normW]')
             do l = isz%beg, isz%end
                 do k = isy%beg, isy%end
                     do j = isx%beg, isx%end
@@ -179,9 +179,9 @@ contains
 
         elseif (id == 3) then
 
-            $:GPU_PARALLEL_LOOP(collapse=3, private=["Omega", "w1L", "w2L", "w3L", &
-                "w1R", "w2R", "w3R", "w1", "w2", "w3", "normWL", "normWR", &
-                "normW"])
+            $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, &
+                & w1R, w2R, w3R, w1, w2, w3, normWL, normWR, &
+                & normW]')
             do l = isz%beg, isz%end
                 do k = isy%beg, isy%end
                     do j = isx%beg, isx%end
@@ -335,7 +335,7 @@ contains
 
         end if
 
-        $:GPU_UPDATE(device=["is1","is2","is3","iv"])
+        $:GPU_UPDATE(device='[is1,is2,is3,iv]')
 
         if (recon_dir == 1) then
             $:GPU_PARALLEL_LOOP(collapse=4)
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 658f5abce8..570552a405 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -77,7 +77,7 @@ module m_time_steppers
     integer, private :: num_ts !<
     !! Number of time stages in the time-stepping scheme
 
-    $:GPU_DECLARE(create=["q_cons_ts","q_prim_vf","q_T_sf","rhs_vf","q_prim_ts","rhs_mv","rhs_pb","max_dt"])
+    $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]')
 
 contains
 
@@ -948,7 +948,7 @@ contains
             if (stage == 3) then
                 if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
                 if (lag_params%write_bubbles) then
-                    $:GPU_UPDATE(host=["gas_p","gas_mv","intfc_rad","intfc_vel"])
+                    $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]')
                     call s_write_lag_particles(mytime)
                 end if
                 call s_write_void_evol(mytime)
@@ -981,7 +981,7 @@ contains
             q_prim_vf, &
             idwint)
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private=["vel", "alpha", "Re"])
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re]')
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -1005,7 +1005,7 @@ contains
             call s_mpi_allreduce_min(dt_local, dt)
         end if
 
-        $:GPU_UPDATE(device=["dt"])
+        $:GPU_UPDATE(device='[dt]')
 
     end subroutine s_compute_dt
 
@@ -1050,7 +1050,7 @@ contains
         integer :: i !< Generic loop iterator
 
         do i = 1, sys_size
-            $:GPU_UPDATE(host=["q_prim_vf(i)%sf"])
+            $:GPU_UPDATE(host='[q_prim_vf(i)%sf]')
         end do
 
         if (t_step == t_step_start) then
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index da65b0d0a0..d65c9bb029 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -24,10 +24,10 @@ module m_viscous
 
     type(int_bounds_info) :: iv
     type(int_bounds_info) :: is1_viscous, is2_viscous, is3_viscous
-    $:GPU_DECLARE(create=["is1_viscous","is2_viscous","is3_viscous","iv"])
+    $:GPU_DECLARE(create='[is1_viscous,is2_viscous,is3_viscous,iv]')
 
     real(wp), allocatable, dimension(:, :) :: Res_viscous
-    $:GPU_DECLARE(create=["Res_viscous"])
+    $:GPU_DECLARE(create='[Res_viscous]')
 
 contains
 
@@ -42,8 +42,8 @@ contains
                 Res_viscous(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
             end do
         end do
-        $:GPU_UPDATE(device=["Res_viscous","Re_idx","Re_size"])
-        $:GPU_ENTER_DATA(copyin=["is1_viscous","is2_viscous","is3_viscous","iv"])
+        $:GPU_UPDATE(device='[Res_viscous,Re_idx,Re_size]')
+        $:GPU_ENTER_DATA(copyin='[is1_viscous,is2_viscous,is3_viscous,iv]')
 
     end subroutine s_initialize_viscous_module
 
@@ -75,7 +75,7 @@ contains
 
         is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
 
-        $:GPU_UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
+        $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]')
 
         $:GPU_PARALLEL_LOOP(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
@@ -89,8 +89,8 @@ contains
             end do
         end do
         if (shear_stress) then    ! Shear stresses
-            $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
-                "alpha_rho_visc", "Re_visc", "tau_Re"])
+            $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, &
+                & alpha_rho_visc, Re_visc, tau_Re]')
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
@@ -197,8 +197,8 @@ contains
         end if
 
         if (bulk_stress) then    ! Bulk stresses
-            $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
-                "alpha_rho_visc", "Re_visc", "tau_Re"])
+            $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, &
+                & alpha_rho_visc, Re_visc, tau_Re]')
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
@@ -302,8 +302,8 @@ contains
         if (p == 0) return
 
         if (shear_stress) then    ! Shear stresses
-            $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
-                "alpha_rho_visc", "Re_visc", "tau_Re"])
+            $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, &
+                & alpha_rho_visc, Re_visc, tau_Re]')
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
@@ -411,8 +411,8 @@ contains
         end if
 
         if (bulk_stress) then    ! Bulk stresses
-            $:GPU_PARALLEL_LOOP(collapse=3, private=["alpha_visc", &
-                "alpha_rho_visc", "Re_visc", "tau_Re"])
+            $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, &
+                & alpha_rho_visc, Re_visc, tau_Re]')
             do l = is3_viscous%beg, is3_viscous%end
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
@@ -549,7 +549,7 @@ contains
 
             iv%beg = mom_idx%beg; iv%end = mom_idx%end
 
-            $:GPU_UPDATE(device=["iv"])
+            $:GPU_UPDATE(device='[iv]')
 
             call s_reconstruct_cell_boundary_values_visc( &
                 q_prim_qp%vf(iv%beg:iv%end), &
@@ -587,11 +587,11 @@ contains
         else ! Compute velocity gradient at cell centers using finite differences
 
             iv%beg = mom_idx%beg; iv%end = mom_idx%end
-            $:GPU_UPDATE(device=["iv"])
+            $:GPU_UPDATE(device='[iv]')
 
             is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
 
-            $:GPU_UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
+            $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]')
 
             $:GPU_PARALLEL_LOOP(collapse=3)
             do l = is3_viscous%beg, is3_viscous%end
@@ -1319,7 +1319,7 @@ contains
 
         is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
 
-        $:GPU_UPDATE(device=["is1_viscous","is2_viscous","is3_viscous"])
+        $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]')
 
         $:GPU_PARALLEL_LOOP(collapse=3)
         do l = is3_viscous%beg, is3_viscous%end
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index d965efa377..ce5d5bf82c 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -42,7 +42,7 @@ module m_weno
     !> @{
     real(wp), allocatable, dimension(:, :, :, :) :: v_rs_ws_x, v_rs_ws_y, v_rs_ws_z
     !> @}
-    $:GPU_DECLARE(create=["v_rs_ws_x","v_rs_ws_y","v_rs_ws_z"])
+    $:GPU_DECLARE(create='[v_rs_ws_x,v_rs_ws_y,v_rs_ws_z]')
 
     ! WENO Coefficients
 
@@ -59,8 +59,8 @@ module m_weno
     real(wp), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_y
     real(wp), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_z
     !> @}
-    $:GPU_DECLARE(create=["poly_coef_cbL_x","poly_coef_cbL_y","poly_coef_cbL_z"])
-    $:GPU_DECLARE(create=["poly_coef_cbR_x","poly_coef_cbR_y","poly_coef_cbR_z"])
+    $:GPU_DECLARE(create='[poly_coef_cbL_x,poly_coef_cbL_y,poly_coef_cbL_z]')
+    $:GPU_DECLARE(create='[poly_coef_cbR_x,poly_coef_cbR_y,poly_coef_cbR_z]')
 
     !> @name The ideal weights at the left and the right cell-boundaries and at the
     !! left and the right quadrature points, in x-, y- and z-directions. Note
@@ -75,7 +75,7 @@ module m_weno
     real(wp), target, allocatable, dimension(:, :) :: d_cbR_y
     real(wp), target, allocatable, dimension(:, :) :: d_cbR_z
     !> @}
-    $:GPU_DECLARE(create=["d_cbL_x","d_cbL_y","d_cbL_z","d_cbR_x","d_cbR_y","d_cbR_z"])
+    $:GPU_DECLARE(create='[d_cbL_x,d_cbL_y,d_cbL_z,d_cbR_x,d_cbR_y,d_cbR_z]')
 
     !> @name Smoothness indicator coefficients in the x-, y-, and z-directions. Note
     !! that the first array dimension identifies the smoothness indicator, the
@@ -86,22 +86,22 @@ module m_weno
     real(wp), target, allocatable, dimension(:, :, :) :: beta_coef_y
     real(wp), target, allocatable, dimension(:, :, :) :: beta_coef_z
     !> @}
-    $:GPU_DECLARE(create=["beta_coef_x","beta_coef_y","beta_coef_z"])
+    $:GPU_DECLARE(create='[beta_coef_x,beta_coef_y,beta_coef_z]')
 
     ! END: WENO Coefficients
 
     integer :: v_size !< Number of WENO-reconstructed cell-average variables
-    $:GPU_DECLARE(create=["v_size"])
+    $:GPU_DECLARE(create='[v_size]')
 
     !> @name Indical bounds in the s1-, s2- and s3-directions
     !> @{
     type(int_bounds_info) :: is1_weno, is2_weno, is3_weno
-    $:GPU_DECLARE(create=["is1_weno","is2_weno","is3_weno"])
+    $:GPU_DECLARE(create='[is1_weno,is2_weno,is3_weno]')
     !
     !> @}
 
     real(wp) :: test
-    $:GPU_DECLARE(create=["test"])
+    $:GPU_DECLARE(create='[test]')
 
 contains
 
@@ -624,11 +624,11 @@ contains
         #:endfor
 
         if (weno_dir == 1) then
-            $:GPU_UPDATE(device=["poly_coef_cbL_x","poly_coef_cbR_x","d_cbL_x","d_cbR_x","beta_coef_x"])
+            $:GPU_UPDATE(device='[poly_coef_cbL_x,poly_coef_cbR_x,d_cbL_x,d_cbR_x,beta_coef_x]')
         elseif (weno_dir == 2) then
-            $:GPU_UPDATE(device=["poly_coef_cbL_y","poly_coef_cbR_y","d_cbL_y","d_cbR_y","beta_coef_y"])
+            $:GPU_UPDATE(device='[poly_coef_cbL_y,poly_coef_cbR_y,d_cbL_y,d_cbR_y,beta_coef_y]')
         else
-            $:GPU_UPDATE(device=["poly_coef_cbL_z","poly_coef_cbR_z","d_cbL_z","d_cbR_z","beta_coef_z"])
+            $:GPU_UPDATE(device='[poly_coef_cbL_z,poly_coef_cbR_z,d_cbL_z,d_cbR_z,beta_coef_z]')
         end if
 
         ! Nullifying WENO coefficients and cell-boundary locations pointers
@@ -662,7 +662,7 @@ contains
         is2_weno = is2_weno_d
         is3_weno = is3_weno_d
 
-        $:GPU_UPDATE(device=["is1_weno","is2_weno","is3_weno"])
+        $:GPU_UPDATE(device='[is1_weno,is2_weno,is3_weno]')
 
         if (weno_order /= 1) then
             call s_initialize_weno(v_vf, &
@@ -713,7 +713,7 @@ contains
         elseif (weno_order == 3) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
-                    $:GPU_PARALLEL_LOOP(collapse=4,private=["beta","dvd","poly","omega","alpha","tau"])
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[beta,dvd,poly,omega,alpha,tau]')
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
@@ -792,7 +792,7 @@ contains
         elseif (weno_order == 5) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
-                    $:GPU_PARALLEL_LOOP(collapse=3,private=["dvd","poly","beta","alpha","omega","tau","delta"])
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[dvd,poly,beta,alpha,omega,tau,delta]')
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
@@ -912,7 +912,7 @@ contains
         elseif (weno_order == 7) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
-                    $:GPU_PARALLEL_LOOP(collapse=3,private=["poly","beta","alpha","omega","tau","delta","dvd","v"])
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[poly,beta,alpha,omega,tau,delta,dvd,v]')
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
@@ -1132,7 +1132,7 @@ contains
         ! as to reshape the inputted data in the coordinate direction of
         ! the WENO reconstruction
         v_size = ubound(v_vf, 1)
-        $:GPU_UPDATE(device=["v_size"])
+        $:GPU_UPDATE(device='[v_size]')
 
         if (weno_dir == 1) then
             $:GPU_PARALLEL_LOOP(collapse=4)
@@ -1266,7 +1266,7 @@ contains
         real(wp), parameter :: alpha_mp = 2._wp
         real(wp), parameter :: beta_mp = 4._wp/3._wp
 
-        $:GPU_PARALLEL_LOOP(collapse=4,private=["d"])
+        $:GPU_PARALLEL_LOOP(collapse=4,private='[d]')
         do l = is3_weno%beg, is3_weno%end
             do k = is2_weno%beg, is2_weno%end
                 do j = is1_weno%beg, is1_weno%end

From ca8f970ee857c35f109ab0d68106b9d07f5d48c9 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 17:35:45 -0400
Subject: [PATCH 50/75] Fixed gpu loop defaults and changed some asserts

---
 src/common/include/parallel_macros.fpp | 6 +++---
 src/common/m_phase_change.fpp          | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index d34b32065f..37fc16c4a7 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -20,8 +20,8 @@
             #:elif c == ')'
                 #:set count = count - 1
             #:endif
-            #:if count > 1
-                #:stop 'Nested parentheses is not supported. Incorrect clause: {}'.format(clause_str)
+            #:if c == ',' and count > 1
+                #:stop 'Nested parentheses with comma inside is not supported. Incorrect clause: {}'.format(clause_str)
             #:elif count < 0
                 #:stop 'Missing parentheses. Incorrect clause: {}'.format(clause_str)
             #:endif
@@ -289,7 +289,7 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
+#:def GPU_LOOP(collapse=None, parallelism=['seq'], data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
 
     #:set collapse_val = GEN_COLLAPSE_STR(collapse)
 
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 27c25e09e5..60710eae7e 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -100,9 +100,9 @@ contains
 
         ! starting equilibrium solver
         $:GPU_PARALLEL_LOOP(collapse=3, private='[p_infOV, p_infpT, p_infSL, &
-            sk, hk, gk, ek, rhok,pS, pSOV, pSSL, &
-            TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, &
-            dynE, rhos, rho, rM, m1, m2, MCT, TvF]')
+            & sk, hk, gk, ek, rhok,pS, pSOV, pSSL, &
+            & TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, &
+            & dynE, rhos, rho, rM, m1, m2, MCT, TvF]')
         do j = 0, m
             do k = 0, n
                 do l = 0, p

From ba189e99219aaff27d76778e7b58bd462526e0d9 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 17:47:12 -0400
Subject: [PATCH 51/75] Switched parallelism from python list to string list

---
 src/common/include/parallel_macros.fpp  | 15 ++++++-----
 src/common/m_boundary_common.fpp        | 22 ++++++++--------
 src/common/m_helper.fpp                 |  4 +--
 src/common/m_helper_basic.fpp           |  6 ++---
 src/common/m_phase_change.fpp           | 12 ++++-----
 src/common/m_variables_conversion.fpp   | 10 ++++----
 src/pre_process/m_assign_variables.fpp  |  4 +--
 src/pre_process/m_patches.fpp           |  8 +++---
 src/simulation/m_acoustic_src.fpp       |  6 ++---
 src/simulation/m_bubbles.fpp            | 34 ++++++++++++-------------
 src/simulation/m_bubbles_EL.fpp         |  4 +--
 src/simulation/m_bubbles_EL_kernels.fpp | 12 ++++-----
 src/simulation/m_compute_cbc.fpp        | 16 ++++++------
 src/simulation/m_hyperelastic.fpp       |  4 +--
 src/simulation/m_ibm.fpp                |  2 +-
 src/simulation/m_qbmm.fpp               | 14 +++++-----
 src/simulation/m_riemann_solvers.fpp    |  4 +--
 src/simulation/m_sim_helpers.fpp        |  6 ++---
 18 files changed, 93 insertions(+), 90 deletions(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index 37fc16c4a7..6bfc432a0f 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -118,10 +118,13 @@
 
 #:def GEN_PARALLELISM_STR(parallelism)
     #:if parallelism is not None
-        $:ASSERT_LIST(parallelism, str)
+        #:assert isinstance(parallelism, str)
+        #:assert parallelism[0] == '[' and parallelism[-1] == ']'
+        #:set parallelism_list = [x.strip() for x in parallelism.strip('[]').split(',')]
+        $:ASSERT_LIST(parallelism_list, str)
         #:assert all((element == 'gang' or element == 'worker' or &
-            & element == 'vector' or element == 'seq') for element in parallelism)
-        #:set parallelism_val = ' '.join(parallelism) + ' '
+            & element == 'vector' or element == 'seq') for element in parallelism_list)
+        #:set parallelism_val = ' '.join(parallelism_list) + ' '
     #:else
         #:set parallelism_val = ''
     #:endif
@@ -191,7 +194,7 @@
 #:enddef
 
 
-#:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism=['gang', 'vector'], &
+#:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', &
     & default='present', firstprivate=None, reduction=None, reductionOp=None, &
     & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, &
     & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None)
@@ -235,7 +238,7 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_ROUTINE(function_name=None, parallelism=['seq'], nohost=False, cray_inline=False, extraAccArgs=None)
+#:def GPU_ROUTINE(function_name=None, parallelism='[seq]', nohost=False, cray_inline=False, extraAccArgs=None)
     #:assert isinstance(cray_inline, bool)
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
 
@@ -289,7 +292,7 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_LOOP(collapse=None, parallelism=['seq'], data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
+#:def GPU_LOOP(collapse=None, parallelism='[seq]', data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
 
     #:set collapse_val = GEN_COLLAPSE_STR(collapse)
 
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index eb08f05b39..79e992be22 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -276,7 +276,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_ghost_cell_extrapolation
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: bc_dir, bc_loc
@@ -337,7 +337,7 @@ contains
     end subroutine s_ghost_cell_extrapolation
 
     pure subroutine s_symmetry(q_prim_vf, bc_dir, bc_loc, k, l, pb, mv)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), optional, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
         integer, intent(in) :: bc_dir, bc_loc
@@ -597,7 +597,7 @@ contains
     end subroutine s_symmetry
 
     pure subroutine s_periodic(q_prim_vf, bc_dir, bc_loc, k, l, pb, mv)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), optional, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
         integer, intent(in) :: bc_dir, bc_loc
@@ -736,7 +736,7 @@ contains
     end subroutine s_periodic
 
     pure subroutine s_axis(q_prim_vf, pb, mv, k, l)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
         integer, intent(in) :: k, l
@@ -798,7 +798,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_slip_wall
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: bc_dir, bc_loc
@@ -892,7 +892,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_no_slip_wall
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: bc_dir, bc_loc
@@ -1022,7 +1022,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_dirichlet
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: bc_dir, bc_loc
@@ -1087,7 +1087,7 @@ contains
     end subroutine s_dirichlet
 
     pure subroutine s_qbmm_extrapolation(bc_dir, bc_loc, k, l, pb, mv)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), optional, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv
         integer, intent(in) :: bc_dir, bc_loc
         integer, intent(in) :: k, l
@@ -1285,7 +1285,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_periodic
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
@@ -1343,7 +1343,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_reflective
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
@@ -1425,7 +1425,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_color_function_ghost_cell_extrapolation
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index 3554d8f2ef..f222e24d50 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -44,7 +44,7 @@ contains
         !! @param Rtmp is the  bubble radii
         !! @param ntmp is the output number bubble density
     pure subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: vftmp
         real(wp), dimension(nb), intent(in) :: Rtmp
         real(wp), intent(out) :: ntmp
@@ -58,7 +58,7 @@ contains
     end subroutine s_comp_n_from_prim
 
     pure subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: vftmp
         real(wp), dimension(nb), intent(in) :: nRtmp
         real(wp), intent(out) :: ntmp
diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp
index 35300ff3a6..7a624dfbb4 100644
--- a/src/common/m_helper_basic.fpp
+++ b/src/common/m_helper_basic.fpp
@@ -26,7 +26,7 @@ contains
     !! @param tol_input Relative error (default = 1e-10_wp).
     !! @return Result of the comparison.
     logical pure elemental function f_approx_equal(a, b, tol_input) result(res)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: a, b
         real(wp), optional, intent(in) :: tol_input
         real(wp) :: tol
@@ -78,7 +78,7 @@ contains
     !> Checks if a real(wp) variable is of default value.
     !! @param var Variable to check.
     logical pure elemental function f_is_default(var) result(res)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: var
 
         res = f_approx_equal(var, dflt_real)
@@ -103,7 +103,7 @@ contains
     !> Checks if a real(wp) variable is an integer.
     !! @param var Variable to check.
     logical pure elemental function f_is_integer(var) result(res)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: var
 
         res = f_approx_equal(var, real(nint(var), wp))
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 60710eae7e..884fb8bc96 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -291,7 +291,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_infinite_pt_relaxation_k
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         ! initializing variables
@@ -395,7 +395,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_infinite_ptg_relaxation_k
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         integer, intent(in) :: j, k, l
@@ -520,7 +520,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_correct_partial_densities
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         !> @name variables for the correction of the reacting partial densities
@@ -583,7 +583,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_jacobian_matrix
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         real(wp), dimension(2, 2), intent(out) :: InvJac
@@ -690,7 +690,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_pTg_residue
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         integer, intent(in) :: j, k, l
@@ -741,7 +741,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_TSat
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         real(wp), intent(in) :: pSat
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 6c7ce05828..f861738684 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -120,7 +120,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_pressure
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         real(wp), intent(in) :: energy, alf
@@ -461,7 +461,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_acc
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K
@@ -542,7 +542,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_bubbles_acc
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         real(wp), intent(inout) :: rho_K, gamma_K, pi_inf_K, qv_K
@@ -1607,7 +1607,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_speed_of_sound
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         real(wp), intent(in) :: pres
@@ -1677,7 +1677,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_fast_magnetosonic_speed
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         real(wp), intent(in) :: B(3), rho, c
diff --git a/src/pre_process/m_assign_variables.fpp b/src/pre_process/m_assign_variables.fpp
index b814265a6e..7bc91f1479 100644
--- a/src/pre_process/m_assign_variables.fpp
+++ b/src/pre_process/m_assign_variables.fpp
@@ -104,7 +104,7 @@ contains
         !! @param patch_id_fp Array to track patch ids
     pure subroutine s_assign_patch_mixture_primitive_variables(patch_id, j, k, l, &
                                                                eta, q_prim_vf, patch_id_fp)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         integer, intent(in) :: patch_id
         integer, intent(in) :: j, k, l
@@ -277,7 +277,7 @@ contains
         !! @param patch_id_fp Array to track patch ids
     impure subroutine s_assign_patch_species_primitive_variables(patch_id, j, k, l, &
                                                                  eta, q_prim_vf, patch_id_fp)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         integer, intent(in) :: patch_id
         integer, intent(in) :: j, k, l
diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp
index 13455a3985..cd043b8f09 100644
--- a/src/pre_process/m_patches.fpp
+++ b/src/pre_process/m_patches.fpp
@@ -2378,7 +2378,7 @@ contains
     end subroutine s_model
 
     subroutine s_convert_cylindrical_to_cartesian_coord(cyl_y, cyl_z)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         real(wp), intent(in) :: cyl_y, cyl_z
 
@@ -2389,7 +2389,7 @@ contains
 
     pure function f_convert_cyl_to_cart(cyl) result(cart)
 
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         t_vec3, intent(in) :: cyl
         t_vec3 :: cart
@@ -2401,7 +2401,7 @@ contains
     end function f_convert_cyl_to_cart
 
     subroutine s_convert_cylindrical_to_spherical_coord(cyl_x, cyl_y)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         real(wp), intent(IN) :: cyl_x, cyl_y
 
@@ -2414,7 +2414,7 @@ contains
     !! @param offset Thickness
     !! @param a Starting position
     pure elemental function f_r(myth, offset, a)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: myth, offset, a
         real(wp) :: b
         real(wp) :: f_r
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index c604d2bb35..fac80b8032 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -346,7 +346,7 @@ contains
     !! @param gauss_sigma_time_local sigma in time for Gaussian pulse
     !! @param source Source term amplitude
     pure elemental subroutine s_source_temporal(sim_time, c, ai, term_index, frequency_local, gauss_sigma_time_local, source, sum_BB)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         integer, intent(in) :: ai, term_index
         real(wp), intent(in) :: sim_time, c, sum_BB
         real(wp), intent(in) :: frequency_local, gauss_sigma_time_local
@@ -698,7 +698,7 @@ contains
     !! @param c Speed of sound
     !! @return frequency_local Converted frequency
     pure elemental function f_frequency_local(freq_conv_flag, ai, c)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         logical, intent(in) :: freq_conv_flag
         integer, intent(in) :: ai
         real(wp), intent(in) :: c
@@ -717,7 +717,7 @@ contains
     !! @param ai Acoustic source index
     !! @return gauss_sigma_time_local Converted Gaussian sigma time
     pure elemental function f_gauss_sigma_time_local(gauss_conv_flag, ai, c)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         logical, intent(in) :: gauss_conv_flag
         integer, intent(in) :: ai
         real(wp), intent(in) :: c
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index c7b05ce116..7cf1c80f11 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -41,7 +41,7 @@ contains
         !!  @param f_divu Divergence of velocity
         !!  @param fCson Speed of sound from fP (EL)
     pure elemental function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu
         real(wp), intent(in) :: fCson
@@ -82,7 +82,7 @@ contains
         !!  @param fV Current bubble velocity
         !!  @param fpb Internal bubble pressure
     pure elemental function f_cpbw(fR0, fR, fV, fpb)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fR0, fR, fV, fpb
 
         real(wp) :: f_cpbw
@@ -101,7 +101,7 @@ contains
         !!  @param fntait Tait EOS parameter
         !!  @param fBtait Tait EOS parameter
     pure elemental function f_H(fCpbw, fCpinf, fntait, fBtait)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fCpbw, fCpinf, fntait, fBtait
 
         real(wp) :: tmp1, tmp2, tmp3
@@ -121,7 +121,7 @@ contains
         !! @param fBtait Tait EOS parameter
         !! @param fH Bubble enthalpy
     pure elemental function f_cgas(fCpinf, fntait, fBtait, fH)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fCpinf, fntait, fBtait, fH
 
         real(wp) :: tmp
@@ -144,7 +144,7 @@ contains
         !!  @param advsrc Advection equation source term
         !!  @param divu Divergence of velocity
     pure elemental function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fRho, fP, falf, fntait, fBtait, advsrc, divu
 
         real(wp) :: c2_liquid
@@ -174,7 +174,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fpbdot Time derivative of the internal bubble pressure
     pure elemental function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fCpbw, fCpinf, fCpinf_dot, fntait, fBtait
         real(wp), intent(in) :: fR, fV, fR0, fpbdot
 
@@ -210,7 +210,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fCpbw Boundary wall pressure
     pure elemental function f_rddot_RP(fCp, fRho, fR, fV, fCpbw)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fCp, fRho, fR, fV, fCpbw
 
         real(wp) :: f_rddot_RP
@@ -233,7 +233,7 @@ contains
         !!  @param fntait Tait EOS parameter
         !!  @param fBtait Tait EOS parameter
     pure elemental function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fCpbw, fR, fV, fH, fHdot
         real(wp), intent(in) :: fcgas, fntait, fBtait
 
@@ -256,7 +256,7 @@ contains
         !!  @param fV Current bubble velocity
         !!  @param fpb Internal bubble pressure
     pure elemental function f_cpbw_KM(fR0, fR, fV, fpb)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fR0, fR, fV, fpb
         real(wp) :: f_cpbw_KM
 
@@ -283,7 +283,7 @@ contains
         !!  @param fR0 Equilibrium bubble radius
         !!  @param fC Current sound speed
     pure elemental function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fpbdot, fCp, fCpbw
         real(wp), intent(in) :: fRho, fR, fV, fR0, fC
 
@@ -317,7 +317,7 @@ contains
         !!  @param pb Internal bubble pressure
         !!  @param iR0 Current bubble size index
     pure elemental subroutine s_bwproperty(pb, iR0, chi_vw, k_mw, rho_mw)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: pb
         integer, intent(in) :: iR0
         real(wp), intent(out) :: chi_vw  !< Bubble wall properties (Ando 2010)
@@ -347,7 +347,7 @@ contains
         !!  @param fR_m Mixture gas constant (EL)
         !!  @param fgamma_m Mixture gamma (EL)
     pure elemental subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_n, fbeta_c, fR_m, fgamma_m)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fR
         real(wp), intent(in) :: fV
         real(wp), intent(in) :: fpb
@@ -405,7 +405,7 @@ contains
         !!  @param fR_m Mixture gas constant (EL)
         !!  @param fgamma_m Mixture gamma (EL)
     pure elemental function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: fvflux
         real(wp), intent(in) :: fR
         real(wp), intent(in) :: fV
@@ -468,7 +468,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_advance_step
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), intent(inout) :: fR, fV, fpb, fmass_v
         real(wp), intent(in) :: fRho, fP, fR0, fpbdot, alf
@@ -602,7 +602,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_initial_substep_h
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu
@@ -688,7 +688,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_advance_substep
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), intent(OUT) :: err
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
@@ -786,7 +786,7 @@ contains
         !!  @param fdMvdt_tmp Rate of change of the mass of vapor in the bubble
     pure elemental subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, &
                                            fmass_n, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(IN) :: fR_tmp, fV_tmp, fPb_tmp, fMv_tmp
         real(wp), intent(IN) :: fmass_n, fbeta_c, fbeta_t
         integer, intent(IN) :: bub_id
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index f556cea019..f573e2879e 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -743,7 +743,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_cson_from_pinf
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         real(wp), intent(in) :: pinf, rhol, gamma, pi_inf
@@ -813,7 +813,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_pinf
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         integer, intent(in) :: bub_id, ptype
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index daac6b8238..c37f71df3b 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -203,7 +203,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_applygaussian
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(3), intent(in) :: center
         integer, dimension(3), intent(in) :: cellaux
@@ -273,7 +273,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_check_celloutside
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         integer, dimension(3), intent(inout) :: cellaux
         logical, intent(out) :: celloutside
@@ -309,7 +309,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_shift_cell_symmetric_bc
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         integer, dimension(3), intent(inout) :: cellaux
         integer, dimension(3), intent(in) :: cell
@@ -350,7 +350,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_stddsv
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         integer, dimension(3), intent(in) :: cell
         real(wp), intent(in) :: volpart
@@ -391,7 +391,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_char_vol
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         integer, intent(in) :: cellx, celly, cellz
         real(wp), intent(out) :: Charvol
@@ -416,7 +416,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_get_cell
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(3), intent(in) :: s_cell
         integer, dimension(3), intent(out) :: get_cell
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index 5bdbc6715d..58a12cabb9 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -88,7 +88,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_slip_wall_L
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -106,7 +106,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_buffer_L
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -136,7 +136,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_inflow_L
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -153,7 +153,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_outflow_L
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -177,7 +177,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_force_free_subsonic_outflow_L
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -199,7 +199,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_constant_pressure_subsonic_outflow_L
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
@@ -221,7 +221,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_inflow_L
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(sys_size), intent(inout) :: L
         L(1:advxe) = 0._wp
@@ -233,7 +233,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_supersonic_outflow_L
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index ae31b0084b..84170ef0a1 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -220,7 +220,7 @@ contains
         !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor
         !! btensor is symmetric, save the data space
     pure subroutine s_neoHookean_cauchy_solver(btensor, q_prim_vf, G, j, k, l)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(scalar_field), dimension(b_size), intent(inout) :: btensor
         real(wp), intent(in) :: G
@@ -259,7 +259,7 @@ contains
         !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor
         !! btensor is symmetric, save the data space
     pure subroutine s_Mooney_Rivlin_cauchy_solver(btensor, q_prim_vf, G, j, k, l)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(scalar_field), dimension(b_size), intent(inout) :: btensor
         real(wp), intent(in) :: G
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index c9fc9ca1c4..a156efd94f 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -764,7 +764,7 @@ contains
     !> Function that uses the interpolation coefficients and the current state
     !! at the cell centers in order to estimate the state at the image point
     pure subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb, mv, presb_IP, massv_IP)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         type(scalar_field), &
             dimension(sys_size), &
             intent(IN) :: q_prim_vf !< Primitive Variables
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 93361bf0a5..e9764a5d0f 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -566,7 +566,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_coeff_nonpoly
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
         real(wp), intent(in) :: pres, rho, c
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
@@ -639,7 +639,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_coeff
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         real(wp), intent(in) :: pres, rho, c
@@ -864,7 +864,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_chyqmom
 #else
-            $:GPU_ROUTINE(parallelism=['seq'])
+            $:GPU_ROUTINE(parallelism='[seq]')
 #endif
             real(wp), intent(in) :: pres, rho, c
             real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
@@ -880,7 +880,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_chyqmom
 #else
-            $:GPU_ROUTINE(parallelism=['seq'])
+            $:GPU_ROUTINE(parallelism='[seq]')
 #endif
             real(wp), dimension(nmom), intent(in) :: momin
             real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY
@@ -940,7 +940,7 @@ contains
 #ifdef _CRAYFTN
             !DIR$ INLINEALWAYS s_hyqmom
 #else
-            $:GPU_ROUTINE(parallelism=['seq'])
+            $:GPU_ROUTINE(parallelism='[seq]')
 #endif
             real(wp), dimension(2), intent(inout) :: frho, fup
             real(wp), dimension(3), intent(in) :: fmom
@@ -959,7 +959,7 @@ contains
         end subroutine s_hyqmom
 
         pure function f_quad(abscX, abscY, wght_in, q, r, s)
-            $:GPU_ROUTINE(parallelism=['seq'])
+            $:GPU_ROUTINE(parallelism='[seq]')
             real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
             real(wp), intent(in) :: q, r, s
 
@@ -975,7 +975,7 @@ contains
         end function f_quad
 
         pure function f_quad2D(abscX, abscY, wght_in, pow)
-            $:GPU_ROUTINE(parallelism=['seq'])
+            $:GPU_ROUTINE(parallelism='[seq]')
             real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
             real(wp), dimension(3), intent(in) :: pow
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 58578f89cb..448264d38a 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -4243,7 +4243,7 @@ contains
     !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
     !! @param[out] tau_shear_out Calculated shear stress tensor (stress on i-face, j-direction).
     pure subroutine s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, tau_shear_out)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         implicit none
 
@@ -4277,7 +4277,7 @@ contains
     !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
     !! @param[out] tau_bulk_out Calculated bulk stress tensor (stress on i-face, i-direction).
     pure subroutine s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, tau_bulk_out)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         implicit none
 
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 3e17e8364f..e8efc4be29 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -34,7 +34,7 @@ contains
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_compute_enthalpy
 #else
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
 #endif
 
         type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
@@ -99,7 +99,7 @@ contains
         !! @param vcfl_sf (optional) cell centered viscous cfl number
         !! @param Rc_sf (optional) cell centered Rc
     pure subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in), dimension(num_vels) :: vel
         real(wp), intent(in) :: c, rho
         real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: icfl_sf
@@ -196,7 +196,7 @@ contains
         !! @param k y coordinate
         !! @param l z coordinate
     pure subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l)
-        $:GPU_ROUTINE(parallelism=['seq'])
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(num_vels), intent(in) :: vel
         real(wp), intent(in) :: c, rho
         real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: max_dt

From 630163b3ec06b26390d5c250b373bfff11564e60 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 18:23:03 -0400
Subject: [PATCH 52/75] Removed default for GPU atomic and fixed GPU wait

---
 src/common/include/parallel_macros.fpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index 6bfc432a0f..7dc0329235 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -389,7 +389,7 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_ATOMIC(atomic='update', extraAccArgs=None)
+#:def GPU_ATOMIC(atomic, extraAccArgs=None)
     #:assert isinstance(atomic, str)
     #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture')
 
@@ -414,11 +414,11 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_WAIT(host=None, device=None, extraAccArgs=None)
+#:def GPU_WAIT(extraAccArgs=None)
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
 
-    #:set clause_val = host_val.strip('\n') + device_val.strip('\n')
-    #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n')
+    #:set clause_val = ''
+    #:set acc_directive = '!$acc wait ' + clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
 #:enddef
 ! New line at end of file is required for FYPP
\ No newline at end of file

From c03adf944cdc0ed174e7110d426d5aabbdd99bf1 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 18:58:52 -0400
Subject: [PATCH 53/75] Replaced many more directives

---
 src/common/m_boundary_common.fpp         | 14 +++----
 src/common/m_helper_basic.fpp            |  2 +-
 src/simulation/m_acoustic_src.fpp        |  4 +-
 src/simulation/m_bubbles_EL_kernels.fpp  |  2 +-
 src/simulation/m_compute_cbc.fpp         | 10 ++---
 src/simulation/m_fftw.fpp                |  2 +-
 src/simulation/m_mpi_proxy.fpp           | 14 +++----
 src/simulation/m_pressure_relaxation.fpp | 52 ++++++++++++------------
 src/simulation/m_qbmm.fpp                |  4 +-
 src/simulation/m_viscous.fpp             |  6 +--
 10 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 79e992be22..d76feef205 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -1638,7 +1638,7 @@ contains
         do dir = 1, num_dims
             do loc = -1, 1, 2
                 read (1) bc_type(dir, loc)%sf
-                !$acc update device(bc_type(dir, loc)%sf)
+                $:GPU_UPDATE(device='[bc_type(dir, loc)%sf]')
             end do
         end do
         close (1)
@@ -1654,7 +1654,7 @@ contains
         do dir = 1, num_dims
             do loc = -1, 1, 2
                 read (1) bc_buffers(dir, loc)%sf
-                !$acc update device(bc_buffers(dir, loc)%sf)
+                $:GPU_UPDATE(device='[bc_buffers(dir, loc)%sf]')
             end do
         end do
         close (1)
@@ -1704,7 +1704,7 @@ contains
                 call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), MPI_INTEGER, MPI_BC_TYPE_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr)
                 call MPI_File_read_all(file_id, bc_type(dir, loc)%sf, 1, MPI_BC_TYPE_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr)
                 offset = offset + sizeof(bc_type(dir, loc)%sf)
-                !$acc update device(bc_type(dir, loc)%sf)
+                $:GPU_UPDATE(device='[bc_type(dir, loc)%sf]')
             end do
         end do
 
@@ -1714,7 +1714,7 @@ contains
                 call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), mpi_p, MPI_BC_BUFFER_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr)
                 call MPI_File_read_all(file_id, bc_buffers(dir, loc)%sf, 1, MPI_BC_BUFFER_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr)
                 offset = offset + sizeof(bc_buffers(dir, loc)%sf)
-                !$acc update device(bc_buffers(dir, loc)%sf)
+                $:GPU_UPDATE(device='[bc_buffers(dir, loc)%sf]')
             end do
         end do
 
@@ -1767,17 +1767,17 @@ contains
 
         bc_type(1, -1)%sf(:, :, :) = bc_x%beg
         bc_type(1, 1)%sf(:, :, :) = bc_x%end
-        !$acc update device(bc_type(1,-1)%sf, bc_type(1,1)%sf)
+        $:GPU_UPDATE(device='[bc_type(1,-1)%sf,bc_type(1,1)%sf]')
 
         if (n > 0) then
             bc_type(2, -1)%sf(:, :, :) = bc_y%beg
             bc_type(2, 1)%sf(:, :, :) = bc_y%end
-            !$acc update device(bc_type(2,-1)%sf, bc_type(2,1)%sf)
+            $:GPU_UPDATE(device='[bc_type(2,-1)%sf,bc_type(2,1)%sf]')
 
             if (p > 0) then
                 bc_type(3, -1)%sf(:, :, :) = bc_z%beg
                 bc_type(3, 1)%sf(:, :, :) = bc_z%end
-                !$acc update device(bc_type(3,-1)%sf, bc_type(3,1)%sf)
+                $:GPU_UPDATE(device='[bc_type(3,-1)%sf,bc_type(3,1)%sf]')
             end if
         end if
 
diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp
index 7a624dfbb4..c21956628b 100644
--- a/src/common/m_helper_basic.fpp
+++ b/src/common/m_helper_basic.fpp
@@ -52,7 +52,7 @@ contains
     !! @param tol_input Relative error (default = 1e-10_wp).
     !! @return Result of the comparison.
     logical pure function f_approx_in_array(a, b, tol_input) result(res)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in) :: a
         real(wp), intent(in) :: b(:)
         real(wp), optional, intent(in) :: tol_input
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index fac80b8032..cdaf92828a 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -205,7 +205,7 @@ contains
                 call s_mpi_send_random_number(phi_rn, bb_num_freq(ai))
             end if
 
-            !$acc loop reduction(+:sum_BB)
+            $:GPU_LOOP(reduction='[[sum_BB]]', reductionOp='[+]')
             do k = 1, bb_num_freq(ai)
                 ! Acoustic period of the wave at each discrete frequency
                 period_BB = 1._wp/(bb_lowest_freq(ai) + k*bb_bandwidth(ai))
@@ -230,7 +230,7 @@ contains
                 B_tait = 0._wp
                 small_gamma = 0._wp
 
-                !$acc loop
+                $:GPU_LOOP()
                 do q = 1, num_fluids
                     myalpha_rho(q) = q_cons_vf(q)%sf(j, k, l)
                     myalpha(q) = q_cons_vf(advxb + q - 1)%sf(j, k, l)
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index c37f71df3b..433a4fa8d1 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -134,7 +134,7 @@ contains
             strength_vol = volpart
             strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2)
 
-            !$acc loop collapse(3) private(cellaux, nodecoord)
+            $:GPU_LOOP(collapse=3,private='[cellaux,nodecoord]')
             do i = 1, smearGrid
                 do j = 1, smearGrid
                     do k = 1, smearGridz
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index 58a12cabb9..de6af78c45 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -20,7 +20,7 @@ module m_compute_cbc
 contains
     !> Base L1 calculation
     pure function f_base_L1(lambda, rho, c, dpres_ds, dvel_ds) result(L1)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), intent(in) :: rho, c, dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
@@ -30,7 +30,7 @@ contains
 
     !> Fill density L variables
     pure subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: lambda_factor, lambda2, c
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
@@ -44,7 +44,7 @@ contains
 
     !> Fill velocity L variables
     pure subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: lambda_factor, lambda2
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
@@ -57,7 +57,7 @@ contains
 
     !> Fill advection L variables
     pure subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: lambda_factor, lambda2
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
@@ -70,7 +70,7 @@ contains
 
     !> Fill chemistry L variables
     pure subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: lambda_factor, lambda2
         real(wp), dimension(num_species), intent(in) :: dYs_ds
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index b30bdbcf93..a8bc439606 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -161,7 +161,7 @@ contains
         p_cmplx => data_cmplx_gpu
         p_fltr_cmplx => data_fltr_cmplx_gpu
 
-!$acc data attach(p_real, p_cmplx, p_fltr_cmplx)
+$:GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
 !$acc host_data use_device(p_real, p_cmplx, p_fltr_cmplx)
 #if defined(__PGI)
         ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 2cd0f97e2a..e500a00898 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -71,7 +71,7 @@ contains
                 i_halo_size = -1 + gp_layers
             end if
 
-            !$acc update device(i_halo_size)
+            $:GPU_UPDATE(device='[i_halo_size]')
             @:ALLOCATE(ib_buff_send(0:i_halo_size), ib_buff_recv(0:i_halo_size))
         end if
 #endif
@@ -297,7 +297,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = 0, p
                         do k = 0, n
                             do j = 0, gp_layers - 1
@@ -307,7 +307,7 @@ contains
                         end do
                     end do
                 #:elif mpi_dir == 2
-                    !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = 0, p
                         do k = 0, gp_layers - 1
                             do j = -gp_layers, m + gp_layers
@@ -318,7 +318,7 @@ contains
                         end do
                     end do
                 #:else
-                    !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = 0, gp_layers - 1
                         do k = -gp_layers, n + gp_layers
                             do j = -gp_layers, m + gp_layers
@@ -345,7 +345,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = 0, p
                         do k = 0, n
                             do j = -gp_layers, -1
@@ -355,7 +355,7 @@ contains
                         end do
                     end do
                 #:elif mpi_dir == 2
-                    !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = 0, p
                         do k = -gp_layers, -1
                             do j = -gp_layers, m + gp_layers
@@ -367,7 +367,7 @@ contains
                     end do
                 #:else
                     ! Unpacking buffer from bc_z%beg
-                    !$acc parallel loop collapse(3) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = -gp_layers, -1
                         do k = -gp_layers, n + gp_layers
                             do j = -gp_layers, m + gp_layers
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
index 8aa78ad88a..d3a63ad6f6 100644
--- a/src/simulation/m_pressure_relaxation.fpp
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -21,10 +21,10 @@ module m_pressure_relaxation
  s_finalize_pressure_relaxation_module
 
     real(wp), allocatable, dimension(:) :: gamma_min, pres_inf
-    !$acc declare create(gamma_min, pres_inf)
+    $:GPU_DECLARE(create='[gamma_min, pres_inf]')
 
     real(wp), allocatable, dimension(:, :) :: Res
-    !$acc declare create(Res)
+    $:GPU_DECLARE(create='[Res]')
 
 contains
 
@@ -39,7 +39,7 @@ contains
             gamma_min(i) = 1._wp/fluid_pp(i)%gamma + 1._wp
             pres_inf(i) = fluid_pp(i)%pi_inf/(1._wp + fluid_pp(i)%gamma)
         end do
-        !$acc update device(gamma_min, pres_inf)
+        $:GPU_UPDATE(device='[gamma_min, pres_inf]')
 
         if (viscous) then
             @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
@@ -48,7 +48,7 @@ contains
                     Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
                 end do
             end do
-            !$acc update device(Res, Re_idx, Re_size)
+            $:GPU_UPDATE(device='[Res, Re_idx, Re_size]')
         end if
 
     end subroutine s_initialize_pressure_relaxation_module
@@ -70,7 +70,7 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer :: j, k, l
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        $:GPU_PARALLEL_LOOP(collapse=3)
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -83,7 +83,7 @@ contains
 
     !> Process pressure relaxation for a single cell
     pure subroutine s_relax_cell_pressure(q_cons_vf, j, k, l)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer, intent(in) :: j, k, l
@@ -103,14 +103,14 @@ contains
 
     !> Check if pressure relaxation is needed for this cell
     pure logical function s_needs_pressure_relaxation(q_cons_vf, j, k, l)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         integer, intent(in) :: j, k, l
         integer :: i
 
         s_needs_pressure_relaxation = .true.
-        !$acc loop seq
+        $:GPU_LOOP()
         do i = 1, num_fluids
             if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > (1._wp - sgm_eps)) then
                 s_needs_pressure_relaxation = .false.
@@ -121,7 +121,7 @@ contains
 
     !> Correct volume fractions to physical bounds
     pure subroutine s_correct_volume_fractions(q_cons_vf, j, k, l)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer, intent(in) :: j, k, l
@@ -129,7 +129,7 @@ contains
         integer :: i
 
         sum_alpha = 0._wp
-        !$acc loop seq
+        $:GPU_LOOP()
         do i = 1, num_fluids
             if ((q_cons_vf(i + contxb - 1)%sf(j, k, l) < 0._wp) .or. &
                 (q_cons_vf(i + advxb - 1)%sf(j, k, l) < 0._wp)) then
@@ -142,7 +142,7 @@ contains
             sum_alpha = sum_alpha + q_cons_vf(i + advxb - 1)%sf(j, k, l)
         end do
 
-        !$acc loop seq
+        $:GPU_LOOP()
         do i = 1, num_fluids
             q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + advxb - 1)%sf(j, k, l)/sum_alpha
         end do
@@ -151,7 +151,7 @@ contains
 
     !> Main pressure equilibration using Newton-Raphson
     pure subroutine s_equilibrate_pressure(q_cons_vf, j, k, l)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer, intent(in) :: j, k, l
@@ -164,7 +164,7 @@ contains
 
         ! Initialize pressures
         pres_relax = 0._wp
-        !$acc loop seq
+        $:GPU_LOOP()
         do i = 1, num_fluids
             if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
                 pres_K_init(i) = (q_cons_vf(i + intxb - 1)%sf(j, k, l)/ &
@@ -180,7 +180,7 @@ contains
         ! Newton-Raphson iteration
         f_pres = 1e-9_wp
         df_pres = 1e9_wp
-        !$acc loop seq
+        $:GPU_LOOP()
         do iter = 0, MAX_ITER - 1
             if (abs(f_pres) > TOLERANCE) then
                 pres_relax = pres_relax - f_pres/df_pres
@@ -194,7 +194,7 @@ contains
                 ! Newton-Raphson step
                 f_pres = -1._wp
                 df_pres = 0._wp
-                !$acc loop seq
+                $:GPU_LOOP()
                 do i = 1, num_fluids
                     if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
                         rho_K_s(i) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/ &
@@ -210,7 +210,7 @@ contains
         end do
 
         ! Update volume fractions
-        !$acc loop seq
+        $:GPU_LOOP()
         do i = 1, num_fluids
             if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) &
                 q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/rho_K_s(i)
@@ -220,7 +220,7 @@ contains
 
     !> Correct internal energies using equilibrated pressure
     pure subroutine s_correct_internal_energies(q_cons_vf, j, k, l)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer, intent(in) :: j, k, l
@@ -230,7 +230,7 @@ contains
         real(wp), dimension(2) :: Re
         integer :: i, q
 
-        !$acc loop seq
+        $:GPU_LOOP()
         do i = 1, num_fluids
             alpha_rho(i) = q_cons_vf(i)%sf(j, k, l)
             alpha(i) = q_cons_vf(E_idx + i)%sf(j, k, l)
@@ -243,14 +243,14 @@ contains
 
         if (bubbles_euler) then
             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                !$acc loop seq
+                $:GPU_LOOP()
                 do i = 1, num_fluids
                     rho = rho + alpha_rho(i)
                     gamma = gamma + alpha(i)*gammas(i)
                     pi_inf = pi_inf + alpha(i)*pi_infs(i)
                 end do
             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                !$acc loop seq
+                $:GPU_LOOP()
                 do i = 1, num_fluids - 1
                     rho = rho + alpha_rho(i)
                     gamma = gamma + alpha(i)*gammas(i)
@@ -264,7 +264,7 @@ contains
         else
             sum_alpha = 0._wp
             if (mpp_lim) then
-                !$acc loop seq
+                $:GPU_LOOP()
                 do i = 1, num_fluids
                     alpha_rho(i) = max(0._wp, alpha_rho(i))
                     alpha(i) = min(max(0._wp, alpha(i)), 1._wp)
@@ -273,7 +273,7 @@ contains
                 alpha = alpha/max(sum_alpha, sgm_eps)
             end if
 
-            !$acc loop seq
+            $:GPU_LOOP()
             do i = 1, num_fluids
                 rho = rho + alpha_rho(i)
                 gamma = gamma + alpha(i)*gammas(i)
@@ -281,11 +281,11 @@ contains
             end do
 
             if (viscous) then
-                !$acc loop seq
+                $:GPU_LOOP()
                 do i = 1, 2
                     Re(i) = dflt_real
                     if (Re_size(i) > 0) Re(i) = 0._wp
-                    !$acc loop seq
+                    $:GPU_LOOP()
                     do q = 1, Re_size(i)
                         Re(i) = alpha(Re_idx(i, q))/Res(i, q) + Re(i)
                     end do
@@ -296,7 +296,7 @@ contains
 
         ! Compute dynamic pressure and update internal energies
         dyn_pres = 0._wp
-        !$acc loop seq
+        $:GPU_LOOP()
         do i = momxb, momxe
             dyn_pres = dyn_pres + 5e-1_wp*q_cons_vf(i)%sf(j, k, l)* &
                        q_cons_vf(i)%sf(j, k, l)/max(rho, sgm_eps)
@@ -304,7 +304,7 @@ contains
 
         pres_relax = (q_cons_vf(E_idx)%sf(j, k, l) - dyn_pres - pi_inf)/gamma
 
-        !$acc loop seq
+        $:GPU_LOOP()
         do i = 1, num_fluids
             q_cons_vf(i + intxb - 1)%sf(j, k, l) = &
                 q_cons_vf(i + advxb - 1)%sf(j, k, l)*(gammas(i)*pres_relax + pi_infs(i))
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index e9764a5d0f..1311afc5d6 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -538,13 +538,13 @@ contains
 
         ! The following block is not repeated and is left as is
         if (idir == 1) then
-            !$acc parallel loop collapse(3) gang vector default(present)
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do l = 0, p
                 do q = 0, n
                     do i = 0, m
                         rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + mom_sp(2)%sf(i, q, l)
                         j = bubxb
-                        !$acc loop seq
+                        $:GPU_LOOP()
                         do k = 1, nb
                             rhs_vf(j)%sf(i, q, l) = rhs_vf(j)%sf(i, q, l) + mom_3d(0, 0, k)%sf(i, q, l)
                             rhs_vf(j + 1)%sf(i, q, l) = rhs_vf(j + 1)%sf(i, q, l) + mom_3d(1, 0, k)%sf(i, q, l)
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index cecfb80d34..16420ab3de 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -999,7 +999,7 @@ contains
 
         end if
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]')
 
         if (n > 0) then
             if (p > 0) then
@@ -1096,7 +1096,7 @@ contains
 
         end if
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]')
 
         if (n > 0) then
             if (p > 0) then
@@ -1199,7 +1199,7 @@ contains
         is3_viscous = iz
         iv = iv_in
 
-        !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv)
+        $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]')
 
         ! First-Order Spatial Derivatives in x-direction
         if (norm_dir == 1) then

From c07345015bb75ed1a709cd75ffc388ff97f6b7c6 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 18:59:34 -0400
Subject: [PATCH 54/75] Add host_data directive

---
 src/common/include/parallel_macros.fpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index 7dc0329235..b8fe35a0e2 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -193,6 +193,11 @@
     $:device_val
 #:enddef
 
+#:def GEN_USE_DEVICE_STR(use_device)
+    #:set use_device_val = GEN_PARENTHESES_CLAUSE('use_device', use_device)
+    $:use_device_val
+#:enddef
+
 
 #:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', &
     & default='present', firstprivate=None, reduction=None, reductionOp=None, &
@@ -351,6 +356,15 @@
     $:acc_directive
 #:enddef
 
+#:def GPU_HOST_DATA(use_device=None, extraAccArgs=None)
+    #:set use_device_val = GEN_USE_DEVICE_STR(use_device)
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+
+    #:set clause_val = use_device_val.strip('\n')
+    #:set acc_directive = '!$acc host_data ' + clause_val + extraAccArgs_val.strip('\n')
+    $:acc_directive
+#:enddef
+
 #:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None)
     #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
 

From c32c956f39f8547133de51e651d89c70c744cf0d Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 19:03:33 -0400
Subject: [PATCH 55/75] Replaced atomic directives

---
 src/simulation/m_bubbles_EL_kernels.fpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index 433a4fa8d1..3d2f1089ac 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -74,19 +74,19 @@ contains
 
             !Update void fraction field
             addFun1 = strength_vol/Vol
-            !$acc atomic update
+            $:GPU_ATOMIC(atomic='update')
             updatedvar%vf(1)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(1)%sf(cell(1), cell(2), cell(3)) + addFun1
 
             !Update time derivative of void fraction
             addFun2 = strength_vel/Vol
-            !$acc atomic update
+            $:GPU_ATOMIC(atomic='update')
             updatedvar%vf(2)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(2)%sf(cell(1), cell(2), cell(3)) + addFun2
 
             !Product of two smeared functions
             !Update void fraction * time derivative of void fraction
             if (lag_params%cluster_type >= 4) then
                 addFun3 = (strength_vol*strength_vel)/Vol
-                !$acc atomic update
+                $:GPU_ATOMIC(atomic='update')
                 updatedvar%vf(5)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(5)%sf(cell(1), cell(2), cell(3)) + addFun3
             end if
         end do
@@ -170,14 +170,14 @@ contains
 
                         !Update void fraction field
                         addFun1 = func*strength_vol
-                        !$acc atomic update
+                        $:GPU_ATOMIC(atomic='update')
                         updatedvar%vf(1)%sf(cellaux(1), cellaux(2), cellaux(3)) = &
                             updatedvar%vf(1)%sf(cellaux(1), cellaux(2), cellaux(3)) &
                             + addFun1
 
                         !Update time derivative of void fraction
                         addFun2 = func*strength_vel
-                        !$acc atomic update
+                        $:GPU_ATOMIC(atomic='update')
                         updatedvar%vf(2)%sf(cellaux(1), cellaux(2), cellaux(3)) = &
                             updatedvar%vf(2)%sf(cellaux(1), cellaux(2), cellaux(3)) &
                             + addFun2
@@ -186,7 +186,7 @@ contains
                         !Update void fraction * time derivative of void fraction
                         if (lag_params%cluster_type >= 4) then
                             addFun3 = func2*strength_vol*strength_vel
-                            !$acc atomic update
+                            $:GPU_ATOMIC(atomic='update')
                             updatedvar%vf(5)%sf(cellaux(1), cellaux(2), cellaux(3)) = &
                                 updatedvar%vf(5)%sf(cellaux(1), cellaux(2), cellaux(3)) &
                                 + addFun3

From eb01272ff295a9aa474a065fc209f0e3ef67781a Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 19:06:08 -0400
Subject: [PATCH 56/75] Replaced host data directives

---
 src/simulation/m_fftw.fpp | 8 ++++----
 src/simulation/m_weno.fpp | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index a8bc439606..f37196b82e 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -162,7 +162,7 @@ contains
         p_fltr_cmplx => data_fltr_cmplx_gpu
 
 $:GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
-!$acc host_data use_device(p_real, p_cmplx, p_fltr_cmplx)
+$:GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]')
 #if defined(__PGI)
         ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
@@ -182,7 +182,7 @@ $:GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
             end do
         end do
 
-!$acc host_data use_device(p_real, p_fltr_cmplx)
+$:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
 #if defined(__PGI)
         ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else
@@ -221,7 +221,7 @@ $:GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
                 end do
             end do
 
-!$acc host_data use_device(p_real, p_cmplx)
+$:GPU_HOST_DATA(use_device='[p_real, p_cmplx]')
 #if defined(__PGI)
             ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
@@ -242,7 +242,7 @@ $:GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
                 end do
             end do
 
-!$acc host_data use_device(p_real, p_fltr_cmplx)
+$:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
 #if defined(__PGI)
             ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index f049e913f9..1c88dcbaf2 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -1155,7 +1155,7 @@ contains
                     block
                         use CuTensorEx
 
-                        !$acc host_data use_device(v_rs_ws_x, v_rs_ws_y)
+                        $:GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
                         v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1, sys_size], order=[2, 1, 3, 4])
                         !$acc end host_data
                     end block
@@ -1163,7 +1163,7 @@ contains
                     block
                         use CuTensorEx
 
-                        !$acc host_data use_device(v_rs_ws_x, v_rs_ws_y)
+                        $:GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
                         v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1 + 2*buff_size, sys_size], order=[2, 1, 3, 4])
                         !$acc end host_data
                     end block
@@ -1194,7 +1194,7 @@ contains
                 block
                     use CuTensorEx
 
-                    !$acc host_data use_device(v_rs_ws_x, v_rs_ws_z)
+                    $:GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_z]')
                     v_rs_ws_z = reshape(v_rs_ws_x, shape=[p + 1 + 2*buff_size, n + 2*buff_size + 1, m + 2*buff_size + 1, sys_size], order=[3, 2, 1, 4])
                     !$acc end host_data
                 end block

From 4159b74aebb5f76aaebafca9164185c8d04ee294 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 19:49:56 -0400
Subject: [PATCH 57/75] Removed all end directives excpet host_data and kernels

---
 src/common/m_variables_conversion.fpp |  1 -
 src/simulation/m_data_output.fpp      |  1 -
 src/simulation/m_hyperelastic.fpp     |  1 -
 src/simulation/m_hypoelastic.fpp      |  6 ------
 src/simulation/m_mhd.fpp              |  1 -
 src/simulation/m_rhs.fpp              |  3 ---
 src/simulation/m_riemann_solvers.fpp  |  6 ------
 src/simulation/m_surface_tension.fpp  |  3 ---
 src/simulation/m_weno.fpp             | 13 -------------
 9 files changed, 35 deletions(-)

diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index f861738684..067ae58142 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -1145,7 +1145,6 @@ contains
                 end do
             end do
         end do
-        !$acc end parallel loop
 
     end subroutine s_convert_conservative_to_primitive_variables
 
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 6d11a68ade..6046113cd5 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -297,7 +297,6 @@ contains
                 end do
             end do
         end do
-        !$acc end parallel loop
 
         ! end: Computing Stability Criteria at Current Time-step
 
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 84170ef0a1..806765c981 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -208,7 +208,6 @@ contains
                 end do
             end do
         end do
-        !$acc end parallel loop
     end subroutine s_hyperelastic_rmt_stress_update
 
     !>  The following subroutine handles the calculation of the btensor.
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index 3e42ecf364..03a15d5471 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -112,7 +112,6 @@ contains
                     end do
                 end do
             end do
-            !$acc end parallel loop
 
             $:GPU_PARALLEL_LOOP(collapse=3)
             do q = 0, p
@@ -127,7 +126,6 @@ contains
                     end do
                 end do
             end do
-            !$acc end parallel loop
 
             if (ndirs > 1) then
                 $:GPU_PARALLEL_LOOP(collapse=3)
@@ -138,7 +136,6 @@ contains
                         end do
                     end do
                 end do
-                !$acc end parallel loop
 
                 $:GPU_PARALLEL_LOOP(collapse=3)
                 do q = 0, p
@@ -156,7 +153,6 @@ contains
                         end do
                     end do
                 end do
-                !$acc end parallel loop
 
                 ! 3D
                 if (ndirs == 3) then
@@ -170,7 +166,6 @@ contains
                             end do
                         end do
                     end do
-                    !$acc end parallel loop
 
                     $:GPU_PARALLEL_LOOP(collapse=3)
                     do q = 0, p
@@ -192,7 +187,6 @@ contains
                             end do
                         end do
                     end do
-                    !$acc end parallel loop
                 end if
             end if
 
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index b17ec69271..e6ee464c22 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -129,7 +129,6 @@ contains
                 end do
             end do
         end do
-        !$acc end parallel loop
 
     end subroutine s_compute_mhd_powell_rhs
 
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index fe90fb7248..8e7ef5f628 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -1742,7 +1742,6 @@ contains
                     end do
                 end do
             end do
-            !$acc end parallel loop
         else if (recon_dir == 2) then
             $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
@@ -1755,7 +1754,6 @@ contains
                     end do
                 end do
             end do
-            !$acc end parallel loop
         else if (recon_dir == 3) then
             $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
@@ -1768,7 +1766,6 @@ contains
                     end do
                 end do
             end do
-            !$acc end parallel loop
         end if
 
     end subroutine s_reconstruct_cell_boundary_values_first_order
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 448264d38a..eec1e9fafd 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -1968,7 +1968,6 @@ contains
                             end do
                         end do
                     end do
-                    !$acc end parallel loop
 
                 elseif (model_eqns == 2 .and. bubbles_euler) then
                     $:GPU_PARALLEL_LOOP(collapse=3, private='[R0_L, R0_R, V0_L, &
@@ -2439,7 +2438,6 @@ contains
                             end do
                         end do
                     end do
-                    !$acc end parallel loop
                 else
                     ! 5-EQUATION MODEL WITH HLLC
                     $:GPU_PARALLEL_LOOP(collapse=3, private='[vel_L, vel_R, &
@@ -2954,7 +2952,6 @@ contains
                             end do
                         end do
                     end do
-                    !$acc end parallel loop
                 end if
             end if
         #:endfor
@@ -3301,7 +3298,6 @@ contains
                         end do
                     end do
                 end do
-                !$acc end parallel loop
             end if
         #:endfor
 
@@ -4095,7 +4091,6 @@ contains
                 end do
             end do
         end do
-        !$acc end parallel loop
 
     end subroutine s_compute_cylindrical_viscous_source_flux
 
@@ -4232,7 +4227,6 @@ contains
                 end do
             end do
         end do
-        !$acc end parallel loop
 
     end subroutine s_compute_cartesian_viscous_source_flux
 
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index fd84e50664..30b6fcec2b 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -349,7 +349,6 @@ contains
                     end do
                 end do
             end do
-            !$acc end parallel loop
         else if (recon_dir == 2) then
             $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
@@ -362,7 +361,6 @@ contains
                     end do
                 end do
             end do
-            !$acc end parallel loop
         else if (recon_dir == 3) then
             $:GPU_PARALLEL_LOOP(collapse=4)
             do i = iv%beg, iv%end
@@ -375,7 +373,6 @@ contains
                     end do
                 end do
             end do
-            !$acc end parallel loop
         end if
 
     end subroutine s_reconstruct_cell_boundary_values_capillary
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 1c88dcbaf2..2dd5823668 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -679,7 +679,6 @@ contains
                         end do
                     end do
                 end do
-                !$acc end parallel loop
             else if (weno_dir == 2) then
                 $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
@@ -692,7 +691,6 @@ contains
                         end do
                     end do
                 end do
-                !$acc end parallel loop
             else if (weno_dir == 3) then
                 $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
@@ -705,7 +703,6 @@ contains
                         end do
                     end do
                 end do
-                !$acc end parallel loop
             end if
         elseif (weno_order == 3) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
@@ -783,7 +780,6 @@ contains
                             end do
                         end do
                     end do
-                    !$acc end parallel loop
                 end if
             #:endfor
         elseif (weno_order == 5) then
@@ -898,7 +894,6 @@ contains
                             end do
                         end do
                     end do
-                    !$acc end parallel loop
 
                     if (mp_weno) then
                         call s_preserve_monotonicity(v_rs_ws_${XYZ}$, vL_rs_vf_${XYZ}$, &
@@ -1094,7 +1089,6 @@ contains
                             end do
                         end do
                     end do
-                    !$acc end parallel loop
 
                 end if
             #:endfor
@@ -1142,7 +1136,6 @@ contains
                     end do
                 end do
             end do
-            !$acc end parallel loop
         end if
 
         ! Reshaping/Projecting onto Characteristic Fields in y-direction
@@ -1157,7 +1150,6 @@ contains
 
                         $:GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
                         v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1, sys_size], order=[2, 1, 3, 4])
-                        !$acc end host_data
                     end block
                 else
                     block
@@ -1165,7 +1157,6 @@ contains
 
                         $:GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
                         v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1 + 2*buff_size, sys_size], order=[2, 1, 3, 4])
-                        !$acc end host_data
                     end block
                 end if
             else
@@ -1180,7 +1171,6 @@ contains
                         end do
                     end do
                 end do
-!$acc end parallel loop
 #if MFC_cuTENSOR
             end if
 #endif
@@ -1196,7 +1186,6 @@ contains
 
                     $:GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_z]')
                     v_rs_ws_z = reshape(v_rs_ws_x, shape=[p + 1 + 2*buff_size, n + 2*buff_size + 1, m + 2*buff_size + 1, sys_size], order=[3, 2, 1, 4])
-                    !$acc end host_data
                 end block
             else
 #endif
@@ -1210,7 +1199,6 @@ contains
                         end do
                     end do
                 end do
-!$acc end parallel loop
 #if MFC_cuTENSOR
             end if
 #endif
@@ -1388,7 +1376,6 @@ contains
                 end do
             end do
         end do
-        !$acc end parallel loop
 
     end subroutine s_preserve_monotonicity
 

From 3c64b6b6b7730711c7fcd5ebb889dac42a3c3660 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 20:04:30 -0400
Subject: [PATCH 58/75] Replaced end host data directives

---
 src/simulation/m_fftw.fpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index f37196b82e..e66b670677 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -169,7 +169,7 @@ $:GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]')
         ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
         call hipCheck(hipDeviceSynchronize())
 #endif
-        !$acc end host_data
+        $:GPU_END_HOST_DATA()
         Nfq = 3
         $:GPU_UPDATE(device='[Nfq]')
 
@@ -189,7 +189,7 @@ $:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
         ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
         call hipCheck(hipDeviceSynchronize())
 #endif
-        !$acc end host_data
+        $:GPU_END_HOST_DATA()
 
         $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
@@ -228,7 +228,7 @@ $:GPU_HOST_DATA(use_device='[p_real, p_cmplx]')
             ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
             call hipCheck(hipDeviceSynchronize())
 #endif
-            !$acc end host_data
+            $:GPU_END_HOST_DATA()
 
             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
             $:GPU_UPDATE(device='[Nfq]')
@@ -249,7 +249,7 @@ $:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
             ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
             call hipCheck(hipDeviceSynchronize())
 #endif
-            !$acc end host_data
+            $:GPU_END_HOST_DATA()
 
             $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
             do k = 1, sys_size

From f74dd1d0686defe338c4182a255113966f900818 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 20:05:21 -0400
Subject: [PATCH 59/75] Added end host data directive

---
 src/common/include/parallel_macros.fpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index b8fe35a0e2..2f93b7a8c0 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -365,6 +365,11 @@
     $:acc_directive
 #:enddef
 
+#:def GPU_END_HOST_DATA()
+    #:set acc_directive = '!$acc end host_data'
+    $:acc_directive
+#:enddef
+
 #:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None)
     #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
 

From 26a3b6f449447cd0c2d19c0ce659a868fc0907ad Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 20:40:41 -0400
Subject: [PATCH 60/75] Updated allocate and deallocate macros

---
 src/common/include/macros.fpp | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 531653f370..759ee24175 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -14,14 +14,16 @@
 
 #:def ALLOCATE(*args)
     @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
-    allocate (${', '.join(args)}$)
-    !$acc enter data create(${', '.join(args)}$)
+    #:set allocated_variables = ', '.join(args)
+    allocate (${allocated_variables}$)
+    $:GPU_ENTER_DATA(create=('[' + allocated_variables + ']'))
 #:enddef ALLOCATE
 
 #:def DEALLOCATE(*args)
     @:LOG({'@:DEALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
-    !$acc exit data delete(${', '.join(args)}$)
-    deallocate (${', '.join(args)}$)
+    #:set allocated_variables = ', '.join(args)
+    $:GPU_EXIT_DATA(delete=('[' + allocated_variables + ']'))
+    deallocate (${allocated_variables}$)
 #:enddef DEALLOCATE
 
 #:def ACC_SETUP_VFs(*args)
@@ -32,13 +34,13 @@
         @:LOG({'@:ACC_SETUP_VFs(${', '.join(args)}$)'})
 
         #:for arg in args
-            !$acc enter data copyin(${arg}$)
-            !$acc enter data copyin(${arg}$%vf)
+            $:GPU_ENTER_DATA(copyin=('[' + arg + ']'))
+            $:GPU_ENTER_DATA(copyin=('[' + arg + '%vf]'))
             if (allocated(${arg}$%vf)) then
                 do macros_setup_vfs_i = lbound(${arg}$%vf, 1), ubound(${arg}$%vf, 1)
                     if (associated(${arg}$%vf(macros_setup_vfs_i)%sf)) then
-                        !$acc enter data copyin(${arg}$%vf(macros_setup_vfs_i))
-                        !$acc enter data create(${arg}$%vf(macros_setup_vfs_i)%sf)
+                        $:GPU_ENTER_DATA(copyin=('[' + arg + '%vf(macros_setup_vfs_i)]'))
+                        $:GPU_ENTER_DATA(copyin=('[' + arg + '%vf(macros_setup_vfs_i)%sf]'))
                     end if
                 end do
             end if
@@ -54,9 +56,9 @@
         @:LOG({'@:ACC_SETUP_SFs(${', '.join(args)}$)'})
 
         #:for arg in args
-            !$acc enter data copyin(${arg}$)
+            $:GPU_ENTER_DATA(copyin=('[' + arg + ']'))
             if (associated(${arg}$%sf)) then
-                !$acc enter data create(${arg}$%sf)
+                $:GPU_ENTER_DATA(copyin=('[' + arg + '%sf]'))
             end if
         #:endfor
     end block
@@ -70,18 +72,18 @@
         @:LOG({'@:ACC_SETUP_source_spatials(${', '.join(args)}$)'})
 
         #:for arg in args
-            !$acc enter data copyin(${arg}$)
+            $:GPU_ENTER_DATA(copyin=('[' + arg + ']'))
             if (allocated(${arg}$%coord)) then
-                !$acc enter data create(${arg}$%coord)
+                $:GPU_ENTER_DATA(copyin=('[' + arg + '%coord]'))
             end if
             if (allocated(${arg}$%val)) then
-                !$acc enter data create(${arg}$%val)
+                $:GPU_ENTER_DATA(copyin=('[' + arg + '%val]'))
             end if
             if (allocated(${arg}$%angle)) then
-                !$acc enter data create(${arg}$%angle)
+                $:GPU_ENTER_DATA(copyin=('[' + arg + '%angle]'))
             end if
             if (allocated(${arg}$%xyz_to_r_ratios)) then
-                !$acc enter data create(${arg}$%xyz_to_r_ratios)
+                $:GPU_ENTER_DATA(copyin=('[' + arg + '%xyz_to_r_ratios]'))
             end if
         #:endfor
     end block

From 96bfabe44ab6d48cdd56bc2c2a7e4df746f5ffcf Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 18 Jun 2025 21:05:04 -0400
Subject: [PATCH 61/75] Ran formatter

---
 src/simulation/m_fftw.fpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index e66b670677..8c64193e28 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -161,8 +161,8 @@ contains
         p_cmplx => data_cmplx_gpu
         p_fltr_cmplx => data_fltr_cmplx_gpu
 
-$:GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
-$:GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]')
+        $:GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
+        $:GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]')
 #if defined(__PGI)
         ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
@@ -182,7 +182,7 @@ $:GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]')
             end do
         end do
 
-$:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
+        $:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
 #if defined(__PGI)
         ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else
@@ -221,7 +221,7 @@ $:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
                 end do
             end do
 
-$:GPU_HOST_DATA(use_device='[p_real, p_cmplx]')
+            $:GPU_HOST_DATA(use_device='[p_real, p_cmplx]')
 #if defined(__PGI)
             ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
@@ -242,7 +242,7 @@ $:GPU_HOST_DATA(use_device='[p_real, p_cmplx]')
                 end do
             end do
 
-$:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
+            $:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
 #if defined(__PGI)
             ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else

From 62cac03ff731c812370b35987220e37f7b8f1a30 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 19 Jun 2025 17:56:48 -0400
Subject: [PATCH 62/75] Updated parallelism defaults, Updated cray inlining and
 used replaced cray inlines

---
 src/common/include/parallel_macros.fpp    |  10 +-
 src/common/m_boundary_common.fpp          |  50 ++---
 src/common/m_chemistry.fpp                |  10 +-
 src/common/m_phase_change.fpp             |  60 ++---
 src/common/m_variables_conversion.fpp     | 109 ++++-----
 src/simulation/include/inline_riemann.fpp |   4 +-
 src/simulation/m_acoustic_src.fpp         |  10 +-
 src/simulation/m_bubbles.fpp              |  24 +-
 src/simulation/m_bubbles_EE.fpp           |  20 +-
 src/simulation/m_bubbles_EL.fpp           |  30 ++-
 src/simulation/m_bubbles_EL_kernels.fpp   |  48 ++--
 src/simulation/m_cbc.fpp                  |  64 +++---
 src/simulation/m_compute_cbc.fpp          |  64 ++----
 src/simulation/m_hyperelastic.fpp         |  16 +-
 src/simulation/m_hypoelastic.fpp          |   6 +-
 src/simulation/m_ibm.fpp                  |  24 +-
 src/simulation/m_mhd.fpp                  |   6 +-
 src/simulation/m_pressure_relaxation.fpp  |  32 +--
 src/simulation/m_qbmm.fpp                 |  64 +++---
 src/simulation/m_rhs.fpp                  |  20 +-
 src/simulation/m_riemann_solvers.fpp      | 262 +++++++++++-----------
 src/simulation/m_sim_helpers.fpp          |  13 +-
 src/simulation/m_viscous.fpp              | 104 ++++-----
 src/simulation/m_weno.fpp                 |   4 +-
 24 files changed, 474 insertions(+), 580 deletions(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index 2f93b7a8c0..87868d1285 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -1,4 +1,3 @@
-
 #:def ASSERT_LIST(data, datatype)
     #:assert data is not None
     #:assert isinstance(data, list)
@@ -243,7 +242,7 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_ROUTINE(function_name=None, parallelism='[seq]', nohost=False, cray_inline=False, extraAccArgs=None)
+#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, extraAccArgs=None)
     #:assert isinstance(cray_inline, bool)
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
 
@@ -264,8 +263,11 @@
             #:stop "When inlining for Cray Compiler, function name must be given and given as a string"
         #:endif
         #:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n')
-        #:set cray_directive = '#ifdef _CRAYFTN\n' + cray_directive + '\n#else\n' + acc_directive + '\n#endif'
+#ifdef _CRAYFTN
         $:cray_directive
+#else
+        $:acc_directive
+#endif
     #:else
         $:acc_directive
     #:endif
@@ -297,7 +299,7 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_LOOP(collapse=None, parallelism='[seq]', data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
+#:def GPU_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
 
     #:set collapse_val = GEN_COLLAPSE_STR(collapse)
 
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index d76feef205..2d92f371e1 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -273,11 +273,8 @@ contains
     end subroutine s_populate_variables_buffers
 
     pure subroutine s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_ghost_cell_extrapolation
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_ghost_cell_extrapolation', &
+            & parallelism='[seq]', cray_inline=True)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: bc_dir, bc_loc
         integer, intent(in) :: k, l
@@ -795,11 +792,8 @@ contains
     end subroutine s_axis
 
     pure subroutine s_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_slip_wall
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_slip_wall',parallelism='[seq]', &
+            & cray_inline=True)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: bc_dir, bc_loc
         integer, intent(in) :: k, l
@@ -889,11 +883,9 @@ contains
     end subroutine s_slip_wall
 
     pure subroutine s_no_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_no_slip_wall
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_no_slip_wall',parallelism='[seq]', &
+            & cray_inline=True)
+
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: bc_dir, bc_loc
         integer, intent(in) :: k, l
@@ -1019,11 +1011,8 @@ contains
     end subroutine s_no_slip_wall
 
     pure subroutine s_dirichlet(q_prim_vf, bc_dir, bc_loc, k, l)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_dirichlet
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_dirichlet',parallelism='[seq]', &
+            & cray_inline=True)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: bc_dir, bc_loc
         integer, intent(in) :: k, l
@@ -1282,11 +1271,8 @@ contains
     end subroutine s_populate_capillary_buffers
 
     pure subroutine s_color_function_periodic(c_divs, bc_dir, bc_loc, k, l)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_color_function_periodic
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_color_function_periodic', &
+            & parallelism='[seq]', cray_inline=True)
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
         integer, intent(in) :: k, l
@@ -1340,11 +1326,8 @@ contains
     end subroutine s_color_function_periodic
 
     pure subroutine s_color_function_reflective(c_divs, bc_dir, bc_loc, k, l)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_color_function_reflective
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_color_function_reflective', &
+            & parallelism='[seq]', cray_inline=True)
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
         integer, intent(in) :: k, l
@@ -1422,11 +1405,8 @@ contains
     end subroutine s_color_function_reflective
 
     pure subroutine s_color_function_ghost_cell_extrapolation(c_divs, bc_dir, bc_loc, k, l)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_color_function_ghost_cell_extrapolation
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_color_function_ghost_cell_extrapolation', & 
+            & parallelism='[seq]', cray_inline=True)
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
         integer, intent(in) :: k, l
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index b1cac0a70b..71aa890e87 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -35,7 +35,7 @@ contains
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do eqn = chemxb, chemxe
                         Ys(eqn - chemxb + 1) = &
                             q_cons_vf(eqn)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z)
@@ -46,7 +46,7 @@ contains
                     ! cons. contxb    = \rho         (1-fluid model)
                     ! cons. momxb + i = \rho u_i
                     energy = q_cons_vf(E_idx)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z)
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do eqn = momxb, momxe
                         energy = energy - &
                                  0.5_wp*(q_cons_vf(eqn)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z))**2._wp
@@ -72,7 +72,7 @@ contains
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = chemxb, chemxe
                         Ys(i - chemxb + 1) = q_prim_vf(i)%sf(x, y, z)
                     end do
@@ -104,7 +104,7 @@ contains
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do eqn = chemxb, chemxe
                         Ys(eqn - chemxb + 1) = q_prim_qp(eqn)%sf(x, y, z)
                     end do
@@ -114,7 +114,7 @@ contains
 
                     call get_net_production_rates(rho, T, Ys, omega)
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do eqn = chemxb, chemxe
 
                         omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1)
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 884fb8bc96..431f6c9ec9 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -108,7 +108,7 @@ contains
                 do l = 0, p
 
                     rho = 0.0_wp; TvF = 0.0_wp
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, num_fluids
 
                         ! Mixture density
@@ -134,7 +134,7 @@ contains
 
                     ! kinetic energy as an auxiliary variable to the calculation of the total internal energy
                     dynE = 0.0_wp
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = momxb, momxe
 
                         dynE = dynE + 5.0e-1_wp*q_cons_vf(i)%sf(j, k, l)**2/rho
@@ -256,7 +256,7 @@ contains
 
                     ! calculating volume fractions, internal energies, and total entropy
                     rhos = 0.0_wp
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, num_fluids
 
                         ! volume fractions
@@ -287,12 +287,8 @@ contains
         !!  @param rhoe mixture energy
         !!  @param TS equilibrium temperature at the interface
     pure subroutine s_infinite_pt_relaxation_k(j, k, l, MFL, pS, p_infpT, q_cons_vf, rhoe, TS)
-
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_infinite_pt_relaxation_k
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_infinite_pt_relaxation_k', &
+            & parallelism='[seq]', cray_inline=True)
 
         ! initializing variables
         integer, intent(in) :: j, k, l, MFL
@@ -308,7 +304,7 @@ contains
         ! auxiliary variables for the pT-equilibrium solver
         mCP = 0.0_wp; mQ = 0.0_wp; p_infpT = ps_inf; 
         ! Performing tests before initializing the pT-equilibrium
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
 
             ! sum of the total alpha*rho*cp of the system
@@ -357,7 +353,7 @@ contains
 
             ! updating functions used in the Newton's solver
             gpp = 0.0_wp; gp = 0.0_wp; hp = 0.0_wp
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do i = 1, num_fluids
 
                 gp = gp + (gs_min(i) - 1.0_wp)*q_cons_vf(i + contxb - 1)%sf(j, k, l)*cvs(i) &
@@ -391,12 +387,8 @@ contains
         !!  @param q_cons_vf Cell-average conservative variables
         !!  @param TS equilibrium temperature at the interface
     pure subroutine s_infinite_ptg_relaxation_k(j, k, l, pS, p_infpT, rhoe, q_cons_vf, TS)
-
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_infinite_ptg_relaxation_k
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_infinite_ptg_relaxation_k', &
+            & parallelism='[seq]', cray_inline=True)
 
         integer, intent(in) :: j, k, l
         real(wp), intent(inout) :: pS
@@ -451,7 +443,7 @@ contains
             mCP = 0.0_wp; mCPD = 0.0_wp; mCVGP = 0.0_wp; mCVGP2 = 0.0_wp; mQ = 0.0_wp; mQD = 0.0_wp
             ! Those must be updated through the iterations, as they either depend on
             ! the partial masses for all fluids, or on the equilibrium pressure
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do i = 1, num_fluids
 
                 ! sum of the total alpha*rho*cp of the system
@@ -516,12 +508,8 @@ contains
         !!  @param k generic loop iterator for y direction
         !!  @param l generic loop iterator for z direction
     pure subroutine s_correct_partial_densities(MCT, q_cons_vf, rM, j, k, l)
-
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_correct_partial_densities
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_correct_partial_densities', &
+            & parallelism='[seq]', cray_inline=True)
 
         !> @name variables for the correction of the reacting partial densities
         !> @{
@@ -579,12 +567,8 @@ contains
         !!  @param q_cons_vf Cell-average conservative variables
         !!  @param TJac Transpose of the Jacobian Matrix
     pure subroutine s_compute_jacobian_matrix(InvJac, j, Jac, k, l, mCPD, mCVGP, mCVGP2, pS, q_cons_vf, TJac)
-
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_jacobian_matrix
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_jacobian_matrix', &
+            & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(2, 2), intent(out) :: InvJac
         integer, intent(in) :: j
@@ -686,12 +670,8 @@ contains
         !!  @param rhoe mixture energy
         !!  @param R2D (2D) residue array
     pure subroutine s_compute_pTg_residue(j, k, l, mCPD, mCVGP, mQD, q_cons_vf, pS, rhoe, R2D)
-
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_pTg_residue
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_pTg_residue', &
+            & parallelism='[seq]', cray_inline=True)
 
         integer, intent(in) :: j, k, l
         real(wp), intent(in) :: mCPD, mCVGP, mQD
@@ -737,12 +717,8 @@ contains
         !!  @param TSat Saturation Temperature
         !!  @param TSIn equilibrium Temperature
     pure elemental subroutine s_TSat(pSat, TSat, TSIn)
-
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_TSat
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_TSat',parallelism='[seq]', &
+            & cray_inline=True)
 
         real(wp), intent(in) :: pSat
         real(wp), intent(out) :: TSat
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 067ae58142..08034d3106 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -116,12 +116,8 @@ contains
         !! @param stress Shear Stress
         !! @param mom Momentum
     subroutine s_compute_pressure(energy, alf, dyn_p, pi_inf, gamma, rho, qv, rhoYks, pres, T, stress, mom, G, pres_mag)
-
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_pressure
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_pressure',parallelism='[seq]', &
+            & cray_inline=True)
 
         real(wp), intent(in) :: energy, alf
         real(wp), intent(in) :: dyn_p
@@ -458,11 +454,9 @@ contains
                                                                gamma_K, pi_inf_K, qv_K, &
                                                                alpha_K, alpha_rho_K, Re_K, &
                                                                G_K, G)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_acc
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_convert_species_to_mixture_variables_acc', &
+            & parallelism='[seq]', cray_inline=True)
+
 
         real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K
 
@@ -539,11 +533,8 @@ contains
     pure subroutine s_convert_species_to_mixture_variables_bubbles_acc(rho_K, &
                                                                        gamma_K, pi_inf_K, qv_K, &
                                                                        alpha_K, alpha_rho_K, Re_K)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_bubbles_acc
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_convert_species_to_mixture_variables_bubbles_acc', &
+            & parallelism='[seq]', cray_inline=True)
 
         real(wp), intent(inout) :: rho_K, gamma_K, pi_inf_K, qv_K
 
@@ -748,7 +739,7 @@ contains
 
                     nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l)
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, nb
                         mu = qK_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc
                         sig = (qK_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc - mu**2)**0.5_wp
@@ -781,7 +772,7 @@ contains
 
                     nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l)
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, nb
                         mu = qK_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc
                         sig = (qK_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc - mu**2)**0.5_wp
@@ -877,7 +868,7 @@ contains
                 do j = ibounds(1)%beg, ibounds(1)%end
                     dyn_pres_K = 0._wp
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, num_fluids
                         alpha_rho_K(i) = qK_cons_vf(i)%sf(j, k, l)
                         alpha_K(i) = qK_cons_vf(advxb + i - 1)%sf(j, k, l)
@@ -921,13 +912,13 @@ contains
                         B2 = B(1)**2 + B(2)**2 + B(3)**2
 
                         m2 = 0._wp
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = momxb, momxe
                             m2 = m2 + qK_cons_vf(i)%sf(j, k, l)**2
                         end do
 
                         S = 0._wp
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, 3
                             S = S + qK_cons_vf(momxb + i - 1)%sf(j, k, l)*B(i)
                         end do
@@ -935,14 +926,14 @@ contains
                         E = qK_cons_vf(E_idx)%sf(j, k, l)
 
                         D = 0._wp
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, contxe
                             D = D + qK_cons_vf(i)%sf(j, k, l)
                         end do
 
                         ! Newton-Raphson
                         W = E + D
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do iter = 1, relativity_cons_to_prim_max_iter
                             Ga = (W + B2)*W/sqrt((W + B2)**2*W**2 - (m2*W**2 + S**2*(2*W + B2)))
                             pres = (W - D*Ga)/((gamma_K + 1)*Ga**2) ! Thermal pressure from EOS
@@ -968,13 +959,13 @@ contains
                         qK_prim_vf(E_idx)%sf(j, k, l) = (W - D*Ga)/((gamma_K + 1)*Ga**2)
 
                         ! Recover the other primitive variables
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, 3
                             qK_prim_vf(momxb + i - 1)%sf(j, k, l) = (qK_cons_vf(momxb + i - 1)%sf(j, k, l) + (S/W)*B(i))/(W + B2)
                         end do
                         qK_prim_vf(1)%sf(j, k, l) = D/Ga ! Hard-coded for single-component for now
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = B_idx%beg, B_idx%end
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                         end do
@@ -984,22 +975,22 @@ contains
 
                     if (chemistry) then
                         rho_K = 0._wp
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = chemxb, chemxe
                             rho_K = rho_K + max(0._wp, qK_cons_vf(i)%sf(j, k, l))
                         end do
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, contxe
                             qK_prim_vf(i)%sf(j, k, l) = rho_K
                         end do
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = chemxb, chemxe
                             qK_prim_vf(i)%sf(j, k, l) = max(0._wp, qK_cons_vf(i)%sf(j, k, l)/rho_K)
                         end do
                     else
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, contxe
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                         end do
@@ -1009,7 +1000,7 @@ contains
                     rho_K = max(rho_K, sgm_eps)
 #endif
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = momxb, momxe
                         if (model_eqns /= 4) then
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) &
@@ -1023,7 +1014,7 @@ contains
                     end do
 
                     if (chemistry) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_species
                             rhoYks(i) = qK_cons_vf(chemxb + i - 1)%sf(j, k, l)
                         end do
@@ -1053,7 +1044,7 @@ contains
                     end if
 
                     if (bubbles_euler) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, nb
                             nRtmp(i) = qK_cons_vf(bubrs(i))%sf(j, k, l)
                         end do
@@ -1065,7 +1056,7 @@ contains
                             nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l)
 
                             !Convert cons to prim
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = bubxb, bubxe
                                 qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc
                             end do
@@ -1082,7 +1073,7 @@ contains
                                 call s_comp_n_from_cons(vftmp, nRtmp, nbub_sc, weight)
                             end if
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = bubxb, bubxe
                                 qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc
                             end do
@@ -1090,21 +1081,21 @@ contains
                     end if
 
                     if (mhd) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = B_idx%beg, B_idx%end
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                         end do
                     end if
 
                     if (elasticity) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = strxb, strxe
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K
                         end do
                     end if
 
                     if (hypoelasticity) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = strxb, strxe
                             ! subtracting elastic contribution for pressure calculation
                             if (G_K > verysmall) then
@@ -1121,13 +1112,13 @@ contains
                     end if
 
                     if (hyperelasticity) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = xibeg, xiend
                             qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K
                         end do
                     end if
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = advxb, advxe
                         qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)
                     end do
@@ -1475,22 +1466,22 @@ contains
             do k = is2b, is2e
                 do j = is1b, is1e
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, contxe
                         alpha_rho_K(i) = qK_prim_vf(j, k, l, i)
                     end do
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = advxb, advxe
                         alpha_K(i - E_idx) = qK_prim_vf(j, k, l, i)
                     end do
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, num_vels
                         vel_K(i) = qK_prim_vf(j, k, l, contxe + i)
                     end do
 
                     vel_K_sum = 0._wp
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, num_vels
                         vel_K_sum = vel_K_sum + vel_K(i)**2._wp
                     end do
@@ -1511,7 +1502,7 @@ contains
                     ! Computing the energy from the pressure
 
                     if (chemistry) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = chemxb, chemxe
                             Y_K(i - chemxb + 1) = qK_prim_vf(j, k, l, i)
                         end do
@@ -1528,12 +1519,12 @@ contains
                     end if
 
                     ! mass flux, this should be \alpha_i \rho_i u_i
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, contxe
                         FK_vf(j, k, l, i) = alpha_rho_K(i)*vel_K(dir_idx(1))
                     end do
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, num_vels
                         FK_vf(j, k, l, contxe + dir_idx(i)) = &
                             rho_K*vel_K(dir_idx(1)) &
@@ -1546,14 +1537,14 @@ contains
 
                     ! Species advection Flux, \rho*u*Y
                     if (chemistry) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_species
                             FK_vf(j, k, l, i - 1 + chemxb) = vel_K(dir_idx(1))*(rho_K*Y_K(i))
                         end do
                     end if
 
                     if (riemann_solver == 1 .or. riemann_solver == 4) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = advxb, advxe
                             FK_vf(j, k, l, i) = 0._wp
                             FK_src_vf(j, k, l, i) = alpha_K(i - E_idx)
@@ -1561,12 +1552,12 @@ contains
 
                     else
                         ! Could be bubbles_euler!
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = advxb, advxe
                             FK_vf(j, k, l, i) = vel_K(dir_idx(1))*alpha_K(i - E_idx)
                         end do
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = advxb, advxe
                             FK_src_vf(j, k, l, i) = vel_K(dir_idx(1))
                         end do
@@ -1603,11 +1594,9 @@ contains
 
 #ifndef MFC_PRE_PROCESS
     pure subroutine s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, adv, vel_sum, c_c, c)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_speed_of_sound
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_speed_of_sound', &
+            & parallelism='[seq]', cray_inline=True)
+
 
         real(wp), intent(in) :: pres
         real(wp), intent(in) :: rho, gamma, pi_inf
@@ -1640,7 +1629,7 @@ contains
                 c = (1._wp/(rho*(adv(1)/blkmod1 + adv(2)/blkmod2)))
             elseif (model_eqns == 3) then
                 c = 0._wp
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do q = 1, num_fluids
                     c = c + adv(q)*(1._wp/gammas(q) + 1._wp)* &
                         (pres + pi_infs(q)/(gammas(q) + 1._wp))
@@ -1673,11 +1662,9 @@ contains
 
 #ifndef MFC_PRE_PROCESS
     pure subroutine s_compute_fast_magnetosonic_speed(rho, c, B, norm, c_fast, h)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_fast_magnetosonic_speed
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_fast_magnetosonic_speed', &
+            & parallelism='[seq]', cray_inline=True)
+
 
         real(wp), intent(in) :: B(3), rho, c
         real(wp), intent(in) :: h ! only used for relativity
diff --git a/src/simulation/include/inline_riemann.fpp b/src/simulation/include/inline_riemann.fpp
index b0ed99390b..047184e4e7 100644
--- a/src/simulation/include/inline_riemann.fpp
+++ b/src/simulation/include/inline_riemann.fpp
@@ -1,7 +1,7 @@
 #:def arithmetic_avg()
     rho_avg = 5e-1_wp*(rho_L + rho_R)
     vel_avg_rms = 0._wp
-    $:GPU_LOOP()
+    $:GPU_LOOP(parallelism='[seq]')
     do i = 1, num_vels
         vel_avg_rms = vel_avg_rms + (5e-1_wp*(vel_L(i) + vel_R(i)))**2._wp
     end do
@@ -17,7 +17,7 @@
 
     vel_avg_rms = 0._wp
 
-    $:GPU_LOOP()
+    $:GPU_LOOP(parallelism='[seq]')
     do i = 1, num_vels
         vel_avg_rms = vel_avg_rms + (sqrt(rho_L)*vel_L(i) + sqrt(rho_R)*vel_R(i))**2._wp/ &
                       (sqrt(rho_L) + sqrt(rho_R))**2._wp
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index cdaf92828a..cd22c96e22 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -230,7 +230,7 @@ contains
                 B_tait = 0._wp
                 small_gamma = 0._wp
 
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do q = 1, num_fluids
                     myalpha_rho(q) = q_cons_vf(q)%sf(j, k, l)
                     myalpha(q) = q_cons_vf(advxb + q - 1)%sf(j, k, l)
@@ -238,7 +238,7 @@ contains
 
                 if (bubbles_euler) then
                     if (num_fluids > 2) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do q = 1, num_fluids - 1
                             myRho = myRho + myalpha_rho(q)
                             B_tait = B_tait + myalpha(q)*pi_infs(q)
@@ -252,7 +252,7 @@ contains
                 end if
 
                 if ((.not. bubbles_euler) .or. (mpp_lim .and. (num_fluids > 2))) then
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do q = 1, num_fluids
                         myRho = myRho + myalpha_rho(q)
                         B_tait = B_tait + myalpha(q)*pi_infs(q)
@@ -323,11 +323,11 @@ contains
         do l = 0, p
             do k = 0, n
                 do j = 0, m
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do q = contxb, contxe
                         rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mass_src(j, k, l)
                     end do
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do q = momxb, momxe
                         rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mom_src(q - contxe, j, k, l)
                     end do
diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp
index 7cf1c80f11..3a9b017d1c 100644
--- a/src/simulation/m_bubbles.fpp
+++ b/src/simulation/m_bubbles.fpp
@@ -465,11 +465,9 @@ contains
                                    fntait, fBtait, f_bub_adv_src, f_divu, &
                                    bub_id, fmass_v, fmass_n, fbeta_c, &
                                    fbeta_t, fCson, adap_dt_stop)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_advance_step
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_advance_step',parallelism='[seq]', &
+            & cray_inline=True)
+
         real(wp), intent(inout) :: fR, fV, fpb, fmass_v
         real(wp), intent(in) :: fRho, fP, fR0, fpbdot, alf
         real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu
@@ -599,11 +597,9 @@ contains
     pure subroutine s_initial_substep_h(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, &
                                         fntait, fBtait, f_bub_adv_src, f_divu, &
                                         fCson, h)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_initial_substep_h
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_initial_substep_h',parallelism='[seq]', &
+            & cray_inline=True)
+
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu
         real(wp), intent(IN) :: fCson
@@ -685,11 +681,9 @@ contains
                                       bub_id, fmass_v, fmass_n, fbeta_c, &
                                       fbeta_t, fCson, h, &
                                       myR_tmp, myV_tmp, myPb_tmp, myMv_tmp)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_advance_substep
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_advance_substep',parallelism='[seq]', &
+            & cray_inline=True)
+
         real(wp), intent(OUT) :: err
         real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf
         real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu, h
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index c7d85c90f0..fd6a8e14c3 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -81,7 +81,7 @@ contains
             do k = 0, n
                 do j = 0, m
                     nR3bar = 0._wp
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, nb
                         nR3bar = nR3bar + weight(i)*(q_cons_vf(rs(i))%sf(j, k, l))**3._wp
                     end do
@@ -178,7 +178,7 @@ contains
                 do j = 0, m
                     bub_adv_src(j, k, l) = 0._wp
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do q = 1, nb
                         bub_r_src(j, k, l, q) = 0._wp
                         bub_v_src(j, k, l, q) = 0._wp
@@ -200,7 +200,7 @@ contains
                     if (adv_n) then
                         nbub = q_prim_vf(n_idx)%sf(j, k, l)
                     else
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do q = 1, nb
                             Rtmp(q) = q_prim_vf(rs(q))%sf(j, k, l)
                             Vtmp(q) = q_prim_vf(vs(q))%sf(j, k, l)
@@ -208,7 +208,7 @@ contains
 
                         R3 = 0._wp
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do q = 1, nb
                             R3 = R3 + weight(q)*Rtmp(q)**3._wp
                         end do
@@ -219,7 +219,7 @@ contains
                     if (.not. adap_dt) then
                         R2Vav = 0._wp
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do q = 1, nb
                             R2Vav = R2Vav + weight(q)*Rtmp(q)**2._wp*Vtmp(q)
                         end do
@@ -227,10 +227,10 @@ contains
                         bub_adv_src(j, k, l) = 4._wp*pi*nbub*R2Vav
                     end if
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do q = 1, nb
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do ii = 1, num_fluids
                             myalpha_rho(ii) = q_cons_vf(ii)%sf(j, k, l)
                             myalpha(ii) = q_cons_vf(advxb + ii - 1)%sf(j, k, l)
@@ -241,14 +241,14 @@ contains
                         B_tait = 0._wp
 
                         if (mpp_lim .and. (num_fluids > 2)) then
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do ii = 1, num_fluids
                                 myRho = myRho + myalpha_rho(ii)
                                 n_tait = n_tait + myalpha(ii)*gammas(ii)
                                 B_tait = B_tait + myalpha(ii)*pi_infs(ii)
                             end do
                         else if (num_fluids > 2) then
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do ii = 1, num_fluids - 1
                                 myRho = myRho + myalpha_rho(ii)
                                 n_tait = n_tait + myalpha(ii)*gammas(ii)
@@ -331,7 +331,7 @@ contains
                         rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + bub_adv_src(i, q, l)
                         if (num_fluids > 1) rhs_vf(advxb)%sf(i, q, l) = &
                             rhs_vf(advxb)%sf(i, q, l) - bub_adv_src(i, q, l)
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do k = 1, nb
                             rhs_vf(rs(k))%sf(i, q, l) = rhs_vf(rs(k))%sf(i, q, l) + bub_r_src(i, q, l, k)
                             rhs_vf(vs(k))%sf(i, q, l) = rhs_vf(vs(k))%sf(i, q, l) + bub_v_src(i, q, l, k)
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index f573e2879e..20e7515c83 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -575,7 +575,7 @@ contains
             call s_get_pinf(k, q_prim_vf, 1, myPinf, cell, aux1, aux2)
 
             ! Obtain liquid density and computing speed of sound from pinf
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do i = 1, num_fluids
                 myalpha_rho(i) = q_prim_vf(i)%sf(cell(1), cell(2), cell(3))
                 myalpha(i) = q_prim_vf(E_idx + i)%sf(cell(1), cell(2), cell(3))
@@ -740,11 +740,9 @@ contains
         !! @param pi_inf Liquid stiffness
         !! @param cson Calculated speed of sound
     pure subroutine s_compute_cson_from_pinf(q_prim_vf, pinf, cell, rhol, gamma, pi_inf, cson)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_cson_from_pinf
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_cson_from_pinf', &
+            & parallelism='[seq]', cray_inline=True)
+
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         real(wp), intent(in) :: pinf, rhol, gamma, pi_inf
         integer, dimension(3), intent(in) :: cell
@@ -754,7 +752,7 @@ contains
         real(wp), dimension(num_dims) :: vel
         integer :: i
 
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_dims
             vel(i) = q_prim_vf(i + contxe)%sf(cell(1), cell(2), cell(3))
         end do
@@ -810,11 +808,9 @@ contains
         !! @param cell Bubble cell
         !! @param Romega Control volume radius
     pure subroutine s_get_pinf(bub_id, q_prim_vf, ptype, f_pinfl, cell, preterm1, term2, Romega)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_get_pinf
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_get_pinf',parallelism='[seq]', &
+            & cray_inline=True)
+
         integer, intent(in) :: bub_id, ptype
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         real(wp), intent(out) :: f_pinfl
@@ -835,7 +831,7 @@ contains
 
         !< Find current bubble cell
         cell(:) = int(scoord(:))
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_dims
             if (scoord(i) < 0._wp) cell(i) = cell(i) - 1
         end do
@@ -926,11 +922,11 @@ contains
             charpres2 = 0._wp
             vol = 0._wp
 
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do i = 1, smearGrid
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do j = 1, smearGrid
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do k = 1, smearGridz
                         cellaux(1) = cell(1) + i - (mapCells + 1)
                         cellaux(2) = cell(2) + j - (mapCells + 1)
@@ -1652,7 +1648,7 @@ contains
 
         integer :: i
 
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = bub_id, nBubs - 1
             lag_id(i, 1) = lag_id(i + 1, 1)
             bub_R0(i) = bub_R0(i + 1)
diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp
index 3d2f1089ac..48ea3bad9a 100644
--- a/src/simulation/m_bubbles_EL_kernels.fpp
+++ b/src/simulation/m_bubbles_EL_kernels.fpp
@@ -200,11 +200,9 @@ contains
 
     !> The purpose of this subroutine is to apply the gaussian kernel function for each bubble (Maeda and Colonius, 2018)).
     pure subroutine s_applygaussian(center, cellaux, nodecoord, stddsv, strength_idx, func)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_applygaussian
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_applygaussian',parallelism='[seq]', &
+            & cray_inline=True)
+
         real(wp), dimension(3), intent(in) :: center
         integer, dimension(3), intent(in) :: cellaux
         real(wp), dimension(3), intent(in) :: nodecoord
@@ -270,11 +268,9 @@ contains
             !! @param cellaux Tested cell to smear the bubble effect in.
             !! @param celloutside If true, then cellaux is outside the computational domain.
     pure subroutine s_check_celloutside(cellaux, celloutside)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_check_celloutside
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_check_celloutside',parallelism='[seq]', &
+            & cray_inline=True)
+
         integer, dimension(3), intent(inout) :: cellaux
         logical, intent(out) :: celloutside
 
@@ -306,11 +302,9 @@ contains
             !! @param cell Cell of the current bubble
             !! @param cellaux Cell to map the bubble effect in.
     pure subroutine s_shift_cell_symmetric_bc(cellaux, cell)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_shift_cell_symmetric_bc
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_shift_cell_symmetric_bc', &
+            & parallelism='[seq]', cray_inline=True)
+
         integer, dimension(3), intent(inout) :: cellaux
         integer, dimension(3), intent(in) :: cell
 
@@ -347,11 +341,9 @@ contains
             !! @param volpart Volume of the bubble
             !! @param stddsv Standard deviaton
     pure subroutine s_compute_stddsv(cell, volpart, stddsv)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_stddsv
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_stddsv',parallelism='[seq]', &
+            & cray_inline=True)
+
         integer, dimension(3), intent(in) :: cell
         real(wp), intent(in) :: volpart
         real(wp), intent(out) :: stddsv
@@ -388,11 +380,9 @@ contains
             !! @param cell Computational coordinates (x, y, z)
             !! @param Charvol Characteristic volume
     pure elemental subroutine s_get_char_vol(cellx, celly, cellz, Charvol)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_get_char_vol
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_get_char_vol',parallelism='[seq]', &
+            & cray_inline=True)
+
         integer, intent(in) :: cellx, celly, cellz
         real(wp), intent(out) :: Charvol
 
@@ -413,11 +403,9 @@ contains
             !! @param s Computational coordinates of the bubble, real type
             !! @param get_cell Computational coordinates of the bubble, integer type
     pure subroutine s_get_cell(s_cell, get_cell)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_get_cell
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_get_cell',parallelism='[seq]', &
+            & cray_inline=True)
+
         real(wp), dimension(3), intent(in) :: s_cell
         integer, dimension(3), intent(out) :: get_cell
         integer :: i
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 1062130a7d..557ad6d1ae 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -779,25 +779,25 @@ contains
                     do k = is2%beg, is2%end
 
                         ! Transferring the Primitive Variables
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, contxe
                             alpha_rho(i) = q_prim_rs${XYZ}$_vf(0, k, r, i)
                         end do
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_dims
                             vel(i) = q_prim_rs${XYZ}$_vf(0, k, r, contxe + i)
                         end do
 
                         vel_K_sum = 0._wp
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_dims
                             vel_K_sum = vel_K_sum + vel(i)**2._wp
                         end do
 
                         pres = q_prim_rs${XYZ}$_vf(0, k, r, E_idx)
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, advxe - E_idx
                             adv(i) = q_prim_rs${XYZ}$_vf(0, k, r, E_idx + i)
                         end do
@@ -808,13 +808,13 @@ contains
                             call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc)
                         end if
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, contxe
                             mf(i) = alpha_rho(i)/rho
                         end do
 
                         if (chemistry) then
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = chemxb, chemxe
                                 Ys(i - chemxb + 1) = q_prim_rs${XYZ}$_vf(0, k, r, i)
                             end do
@@ -847,39 +847,39 @@ contains
 
                         ! First-Order Spatial Derivatives of Primitive Variables
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, contxe
                             dalpha_rho_ds(i) = 0._wp
                         end do
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_dims
                             dvel_ds(i) = 0._wp
                         end do
 
                         dpres_ds = 0._wp
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, advxe - E_idx
                             dadv_ds(i) = 0._wp
                         end do
 
                         if (chemistry) then
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_species
                                 dYs_ds(i) = 0._wp
                             end do
                         end if
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do j = 0, buff_size
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, contxe
                                 dalpha_rho_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, i)* &
                                                    fd_coef_${XYZ}$ (j, cbc_loc) + &
                                                    dalpha_rho_ds(i)
                             end do
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_dims
                                 dvel_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, contxe + i)* &
                                              fd_coef_${XYZ}$ (j, cbc_loc) + &
@@ -889,7 +889,7 @@ contains
                             dpres_ds = q_prim_rs${XYZ}$_vf(j, k, r, E_idx)* &
                                        fd_coef_${XYZ}$ (j, cbc_loc) + &
                                        dpres_ds
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, advxe - E_idx
                                 dadv_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, E_idx + i)* &
                                              fd_coef_${XYZ}$ (j, cbc_loc) + &
@@ -897,7 +897,7 @@ contains
                             end do
 
                             if (chemistry) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_species
                                     dYs_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, chemxb - 1 + i)* &
                                                 fd_coef_${XYZ}$ (j, cbc_loc) + &
@@ -924,7 +924,7 @@ contains
                             call s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds)
                             ! Add GRCBC for Subsonic Inflow
                             if (bc_${XYZ}$%grcbc_in) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 2, momxb
                                     L(2) = c**3._wp*Ma*(alpha_rho(i - 1) - alpha_rho_in(i - 1, ${CBC_DIR}$))/Del_in(${CBC_DIR}$) - c*Ma*(pres - pres_in(${CBC_DIR}$))/Del_in(${CBC_DIR}$)
                                 end do
@@ -934,7 +934,7 @@ contains
                                         L(momxb + 2) = c*Ma*(vel(dir_idx(3)) - vel_in(${CBC_DIR}$, dir_idx(3)))/Del_in(${CBC_DIR}$)
                                     end if
                                 end if
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = E_idx, advxe - 1
                                     L(i) = c*Ma*(adv(i + 1 - E_idx) - alpha_in(i + 1 - E_idx, ${CBC_DIR}$))/Del_in(${CBC_DIR}$)
                                 end do
@@ -974,13 +974,13 @@ contains
                             dpres_dt = -5e-1_wp*(L(advxe) + L(1))
                         end if
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, contxe
                             dalpha_rho_dt(i) = &
                                 -(L(i + 1) - mf(i)*dpres_dt)/(c*c)
                         end do
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_dims
                             dvel_dt(dir_idx(i)) = dir_flg(dir_idx(i))* &
                                                   (L(1) - L(advxe))/(2._wp*rho*c) + &
@@ -989,13 +989,13 @@ contains
                         end do
 
                         vel_dv_dt_sum = 0._wp
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_dims
                             vel_dv_dt_sum = vel_dv_dt_sum + vel(i)*dvel_dt(i)
                         end do
 
                         if (chemistry) then
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_species
                                 dYs_dt(i) = -1._wp*L(chemxb + i - 1)
                             end do
@@ -1003,12 +1003,12 @@ contains
 
                         ! The treatment of void fraction source is unclear
                         if (cyl_coord .and. cbc_dir == 2 .and. cbc_loc == 1) then
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, advxe - E_idx
                                 dadv_dt(i) = -L(momxe + i) !+ adv(i) * vel(dir_idx(1))/y_cc(n)
                             end do
                         else
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, advxe - E_idx
                                 dadv_dt(i) = -L(momxe + i)
                             end do
@@ -1021,7 +1021,7 @@ contains
                             dgamma_dt = dadv_dt(1)
                             dpi_inf_dt = dadv_dt(2)
                         else
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_fluids
                                 drho_dt = drho_dt + dalpha_rho_dt(i)
                                 dgamma_dt = dgamma_dt + dadv_dt(i)*gammas(i)
@@ -1031,13 +1031,13 @@ contains
                         end if
 
                         ! flux_rs_vf_l and flux_src_rs_vf_l at j = -1/2
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, contxe
                             flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) &
                                                                + ds(0)*dalpha_rho_dt(i)
                         end do
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = momxb, momxe
                             flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) &
                                                                + ds(0)*(vel(i - contxe)*drho_dt &
@@ -1048,14 +1048,14 @@ contains
                             ! Evolution of LODI equation of energy for real gases adjusted to perfect gas, doi:10.1006/jcph.2002.6990
                             call get_species_enthalpies_rt(T, h_k)
                             sum_Enthalpies = 0._wp
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_species
                                 h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T
                                 sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights(i)*Cp/R_gas)*dYs_dt(i)
                             end do
                             flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) &
                                                                    + ds(0)*((E/rho + pres/rho)*drho_dt + rho*vel_dv_dt_sum + Cp*T*L(2)/(c*c) + sum_Enthalpies)
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_species
                                 flux_rs${XYZ}$_vf_l(-1, k, r, i - 1 + chemxb) = flux_rs${XYZ}$_vf_l(0, k, r, chemxb + i - 1) &
                                                                                 + ds(0)*(drho_dt*Ys(i) + rho*dYs_dt(i))
@@ -1071,12 +1071,12 @@ contains
                         end if
 
                         if (riemann_solver == 1) then
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf_l(-1, k, r, i) = 0._wp
                             end do
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = advxb, advxe
                                 flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = &
                                     1._wp/max(abs(vel(dir_idx(1))), sgm_eps) &
@@ -1089,13 +1089,13 @@ contains
 
                         else
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) + &
                                                                    ds(0)*dadv_dt(i - E_idx)
                             end do
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = advxb, advxe
                                 flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = flux_src_rs${XYZ}$_vf_l(0, k, r, i)
                             end do
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index de6af78c45..b73351b606 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -85,11 +85,9 @@ contains
 
     !> Slip wall CBC (Thompson 1990, pg. 451)
     pure subroutine s_compute_slip_wall_L(lambda, L, rho, c, dpres_ds, dvel_ds)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_slip_wall_L
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_slip_wall_L',parallelism='[seq]', &
+            & cray_inline=True)
+
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: rho, c, dpres_ds
@@ -103,11 +101,9 @@ contains
 
     !> Nonreflecting subsonic buffer CBC (Thompson 1987, pg. 13)
     pure subroutine s_compute_nonreflecting_subsonic_buffer_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_buffer_L
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_buffer_L', &
+            & parallelism='[seq]', cray_inline=True)
+
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: rho, c
@@ -133,11 +129,9 @@ contains
 
     !> Nonreflecting subsonic inflow CBC (Thompson 1990, pg. 455)
     pure subroutine s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_inflow_L
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_inflow_L', &
+            & parallelism='[seq]', cray_inline=True)
+
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: rho, c, dpres_ds
@@ -150,11 +144,9 @@ contains
 
     !> Nonreflecting subsonic outflow CBC (Thompson 1990, pg. 454)
     pure subroutine s_compute_nonreflecting_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_outflow_L
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_outflow_L', &
+            & parallelism='[seq]', cray_inline=True)
+
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: rho, c
@@ -174,11 +166,9 @@ contains
 
     !> Force-free subsonic outflow CBC (Thompson 1990, pg. 454)
     pure subroutine s_compute_force_free_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_force_free_subsonic_outflow_L
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_force_free_subsonic_outflow_L', &
+            & parallelism='[seq]', cray_inline=True)
+
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: rho, c
@@ -196,11 +186,9 @@ contains
 
     !> Constant pressure subsonic outflow CBC (Thompson 1990, pg. 455)
     pure subroutine s_compute_constant_pressure_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_constant_pressure_subsonic_outflow_L
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_constant_pressure_subsonic_outflow_L', &
+            & parallelism='[seq]', cray_inline=True)
+
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: rho, c
@@ -218,11 +206,9 @@ contains
 
     !> Supersonic inflow CBC (Thompson 1990, pg. 453)
     pure subroutine s_compute_supersonic_inflow_L(L)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_supersonic_inflow_L
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', &
+            & parallelism='[seq]', cray_inline=True)
+
         real(wp), dimension(sys_size), intent(inout) :: L
         L(1:advxe) = 0._wp
         if (chemistry) L(chemxb:chemxe) = 0._wp
@@ -230,11 +216,9 @@ contains
 
     !> Supersonic outflow CBC (Thompson 1990, pg. 453)
     pure subroutine s_compute_supersonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_supersonic_outflow_L
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_supersonic_outflow_L', &
+            & parallelism='[seq]', cray_inline=True)
+
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), dimension(sys_size), intent(inout) :: L
         real(wp), intent(in) :: rho, c
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 806765c981..06610a5977 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -55,7 +55,7 @@ contains
         @:ACC_SETUP_VFs(btensor)
 
         @:ALLOCATE(Gs(1:num_fluids))
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
             Gs(i) = fluid_pp(i)%G
         end do
@@ -111,7 +111,7 @@ contains
         do l = 0, p
             do k = 0, n
                 do j = 0, m
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = 1, num_fluids
                         alpha_rho_k(i) = q_cons_vf(i)%sf(j, k, l)
                         alpha_k(i) = q_cons_vf(advxb + i - 1)%sf(j, k, l)
@@ -124,7 +124,7 @@ contains
                     !if ( G <= verysmall ) G_K = 0_wp
 
                     if (G > verysmall) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, tensor_size
                             tensora(i) = 0._wp
                         end do
@@ -133,7 +133,7 @@ contains
                         ! number for the tensor 1-3:  dxix_dx, dxiy_dx, dxiz_dx
                         ! 4-6 :                       dxix_dy, dxiy_dy, dxiz_dy
                         ! 7-9 :                       dxix_dz, dxiy_dz, dxiz_dz
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do r = -fd_number, fd_number
                             ! derivatives in the x-direction
                             tensora(1) = tensora(1) + q_prim_vf(xibeg)%sf(j + r, k, l)*fd_coeff_x(r, j)
@@ -167,7 +167,7 @@ contains
                         if (tensorb(tensor_size) > verysmall) then
                             ! STEP 2c: computing the inverse of grad_xi tensor = F
                             ! tensorb is the adjoint, tensora becomes F
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, tensor_size - 1
                                 tensora(i) = tensorb(i)/tensorb(tensor_size)
                             end do
@@ -198,7 +198,7 @@ contains
                             q_prim_vf(E_idx)%sf(j, k, l) = q_prim_vf(E_idx)%sf(j, k, l) - &
                                                            G*q_prim_vf(xiend + 1)%sf(j, k, l)/gamma
                             ! STEP 5c: updating the Cauchy stress conservative scalar field
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, b_size - 1
                                 q_cons_vf(strxb + i - 1)%sf(j, k, l) = &
                                     rho*q_prim_vf(strxb + i - 1)%sf(j, k, l)
@@ -238,7 +238,7 @@ contains
         #:endfor
         ! dividing by the jacobian for neo-Hookean model
         ! setting the tensor to the stresses for riemann solver
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, b_size - 1
             q_prim_vf(strxb + i - 1)%sf(j, k, l) = &
                 G*btensor(i)%sf(j, k, l)/btensor(b_size)%sf(j, k, l)
@@ -279,7 +279,7 @@ contains
 
         ! dividing by the jacobian for neo-Hookean model
         ! setting the tensor to the stresses for riemann solver
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, b_size - 1
             q_prim_vf(strxb + i - 1)%sf(j, k, l) = &
                 G*btensor(i)%sf(j, k, l)/btensor(b_size)%sf(j, k, l)
diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp
index 03a15d5471..3f736b0b0b 100644
--- a/src/simulation/m_hypoelastic.fpp
+++ b/src/simulation/m_hypoelastic.fpp
@@ -117,7 +117,7 @@ contains
             do q = 0, p
                 do l = 0, n
                     do k = 0, m
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do r = -fd_number, fd_number
                             du_dx(k, l, q) = du_dx(k, l, q) &
                                              + q_prim_vf(momxb)%sf(k + r, l, q)*fd_coeff_x_h(r, k)
@@ -141,7 +141,7 @@ contains
                 do q = 0, p
                     do l = 0, n
                         do k = 0, m
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do r = -fd_number, fd_number
                                 du_dy(k, l, q) = du_dy(k, l, q) &
                                                  + q_prim_vf(momxb)%sf(k, l + r, q)*fd_coeff_y_h(r, l)
@@ -171,7 +171,7 @@ contains
                     do q = 0, p
                         do l = 0, n
                             do k = 0, m
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do r = -fd_number, fd_number
                                     du_dz(k, l, q) = du_dz(k, l, q) &
                                                      + q_prim_vf(momxb)%sf(k, l, q + r)*fd_coeff_z_h(r, q)
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index a156efd94f..67b083584b 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -207,7 +207,7 @@ contains
             dyn_pres = 0._wp
 
             ! Set q_prim_vf params at GP so that mixture vars calculated properly
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do q = 1, num_fluids
                 q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q)
                 q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q)
@@ -243,7 +243,7 @@ contains
             end if
 
             ! Set momentum
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do q = momxb, momxe
                 q_cons_vf(q)%sf(j, k, l) = rho*vel_g(q - momxb + 1)
                 dyn_pres = dyn_pres + q_cons_vf(q)%sf(j, k, l)* &
@@ -251,7 +251,7 @@ contains
             end do
 
             ! Set continuity and adv vars
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do q = 1, num_fluids
                 q_cons_vf(q)%sf(j, k, l) = alpha_rho_IP(q)
                 q_cons_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q)
@@ -305,7 +305,7 @@ contains
             end if
 
             if (model_eqns == 3) then
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do q = intxb, intxe
                     q_cons_vf(q)%sf(j, k, l) = alpha_IP(q - intxb + 1)*(gammas(q - intxb + 1)*pres_IP &
                                                                         + pi_infs(q - intxb + 1))
@@ -333,7 +333,7 @@ contains
                 physical_loc = [x_cc(j), y_cc(k), 0._wp]
             end if
 
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do q = 1, num_fluids
                 q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q)
                 q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q)
@@ -348,7 +348,7 @@ contains
 
             dyn_pres = 0._wp
 
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do q = momxb, momxe
                 q_cons_vf(q)%sf(j, k, l) = rho*vel_g(q - momxb + 1)
                 dyn_pres = dyn_pres + q_cons_vf(q)%sf(j, k, l)* &
@@ -817,11 +817,11 @@ contains
             end if
         end if
 
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = i1, i2
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do j = j1, j2
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do k = k1, k2
 
                     coeff = gp%interp_coeffs(i - i1 + 1, j - j1 + 1, k - k1 + 1)
@@ -829,13 +829,13 @@ contains
                     pres_IP = pres_IP + coeff* &
                               q_prim_vf(E_idx)%sf(i, j, k)
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do q = momxb, momxe
                         vel_IP(q + 1 - momxb) = vel_IP(q + 1 - momxb) + coeff* &
                                                 q_prim_vf(q)%sf(i, j, k)
                     end do
 
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do l = contxb, contxe
                         alpha_rho_IP(l) = alpha_rho_IP(l) + coeff* &
                                           q_prim_vf(l)%sf(i, j, k)
@@ -848,7 +848,7 @@ contains
                     end if
 
                     if (bubbles_euler .and. .not. qbmm) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do l = 1, nb
                             if (polytropic) then
                                 r_IP(l) = r_IP(l) + coeff*q_prim_vf(bubxb + (l - 1)*2)%sf(i, j, k)
diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp
index e6ee464c22..8112b3af7e 100644
--- a/src/simulation/m_mhd.fpp
+++ b/src/simulation/m_mhd.fpp
@@ -82,16 +82,16 @@ contains
                 do k = 0, m
 
                     divB = 0._wp
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do r = -fd_number, fd_number
                         divB = divB + q_prim_vf(B_idx%beg)%sf(k + r, l, q)*fd_coeff_x_h(r, k)
                     end do
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do r = -fd_number, fd_number
                         divB = divB + q_prim_vf(B_idx%beg + 1)%sf(k, l + r, q)*fd_coeff_y_h(r, l)
                     end do
                     if (p > 0) then
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do r = -fd_number, fd_number
                             divB = divB + q_prim_vf(B_idx%beg + 2)%sf(k, l, q + r)*fd_coeff_z_h(r, q)
                         end do
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
index d3a63ad6f6..df1d22a83e 100644
--- a/src/simulation/m_pressure_relaxation.fpp
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -110,7 +110,7 @@ contains
         integer :: i
 
         s_needs_pressure_relaxation = .true.
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
             if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > (1._wp - sgm_eps)) then
                 s_needs_pressure_relaxation = .false.
@@ -129,7 +129,7 @@ contains
         integer :: i
 
         sum_alpha = 0._wp
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
             if ((q_cons_vf(i + contxb - 1)%sf(j, k, l) < 0._wp) .or. &
                 (q_cons_vf(i + advxb - 1)%sf(j, k, l) < 0._wp)) then
@@ -142,7 +142,7 @@ contains
             sum_alpha = sum_alpha + q_cons_vf(i + advxb - 1)%sf(j, k, l)
         end do
 
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
             q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + advxb - 1)%sf(j, k, l)/sum_alpha
         end do
@@ -164,7 +164,7 @@ contains
 
         ! Initialize pressures
         pres_relax = 0._wp
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
             if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
                 pres_K_init(i) = (q_cons_vf(i + intxb - 1)%sf(j, k, l)/ &
@@ -180,7 +180,7 @@ contains
         ! Newton-Raphson iteration
         f_pres = 1e-9_wp
         df_pres = 1e9_wp
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do iter = 0, MAX_ITER - 1
             if (abs(f_pres) > TOLERANCE) then
                 pres_relax = pres_relax - f_pres/df_pres
@@ -194,7 +194,7 @@ contains
                 ! Newton-Raphson step
                 f_pres = -1._wp
                 df_pres = 0._wp
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do i = 1, num_fluids
                     if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
                         rho_K_s(i) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/ &
@@ -210,7 +210,7 @@ contains
         end do
 
         ! Update volume fractions
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
             if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) &
                 q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/rho_K_s(i)
@@ -230,7 +230,7 @@ contains
         real(wp), dimension(2) :: Re
         integer :: i, q
 
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
             alpha_rho(i) = q_cons_vf(i)%sf(j, k, l)
             alpha(i) = q_cons_vf(E_idx + i)%sf(j, k, l)
@@ -243,14 +243,14 @@ contains
 
         if (bubbles_euler) then
             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do i = 1, num_fluids
                     rho = rho + alpha_rho(i)
                     gamma = gamma + alpha(i)*gammas(i)
                     pi_inf = pi_inf + alpha(i)*pi_infs(i)
                 end do
             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do i = 1, num_fluids - 1
                     rho = rho + alpha_rho(i)
                     gamma = gamma + alpha(i)*gammas(i)
@@ -264,7 +264,7 @@ contains
         else
             sum_alpha = 0._wp
             if (mpp_lim) then
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do i = 1, num_fluids
                     alpha_rho(i) = max(0._wp, alpha_rho(i))
                     alpha(i) = min(max(0._wp, alpha(i)), 1._wp)
@@ -273,7 +273,7 @@ contains
                 alpha = alpha/max(sum_alpha, sgm_eps)
             end if
 
-            $:GPU_LOOP()
+            $:GPU_LOOP(parallelism='[seq]')
             do i = 1, num_fluids
                 rho = rho + alpha_rho(i)
                 gamma = gamma + alpha(i)*gammas(i)
@@ -281,11 +281,11 @@ contains
             end do
 
             if (viscous) then
-                $:GPU_LOOP()
+                $:GPU_LOOP(parallelism='[seq]')
                 do i = 1, 2
                     Re(i) = dflt_real
                     if (Re_size(i) > 0) Re(i) = 0._wp
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do q = 1, Re_size(i)
                         Re(i) = alpha(Re_idx(i, q))/Res(i, q) + Re(i)
                     end do
@@ -296,7 +296,7 @@ contains
 
         ! Compute dynamic pressure and update internal energies
         dyn_pres = 0._wp
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = momxb, momxe
             dyn_pres = dyn_pres + 5e-1_wp*q_cons_vf(i)%sf(j, k, l)* &
                        q_cons_vf(i)%sf(j, k, l)/max(rho, sgm_eps)
@@ -304,7 +304,7 @@ contains
 
         pres_relax = (q_cons_vf(E_idx)%sf(j, k, l) - dyn_pres - pi_inf)/gamma
 
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
             q_cons_vf(i + intxb - 1)%sf(j, k, l) = &
                 q_cons_vf(i + advxb - 1)%sf(j, k, l)*(gammas(i)*pres_relax + pi_infs(i))
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 1311afc5d6..40301942b6 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -544,7 +544,7 @@ contains
                     do i = 0, m
                         rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + mom_sp(2)%sf(i, q, l)
                         j = bubxb
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do k = 1, nb
                             rhs_vf(j)%sf(i, q, l) = rhs_vf(j)%sf(i, q, l) + mom_3d(0, 0, k)%sf(i, q, l)
                             rhs_vf(j + 1)%sf(i, q, l) = rhs_vf(j + 1)%sf(i, q, l) + mom_3d(1, 0, k)%sf(i, q, l)
@@ -563,11 +563,9 @@ contains
 
     !Coefficient array for non-polytropic model (pb and mv values are accounted in wght_pb and wght_mv)
     pure subroutine s_coeff_nonpoly(pres, rho, c, coeffs)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_coeff_nonpoly
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_coeff_nonpoly',parallelism='[seq]', &
+            & cray_inline=True)
+
         real(wp), intent(in) :: pres, rho, c
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
 
@@ -636,11 +634,9 @@ contains
 
 !Coefficient array for polytropic model (pb for each R0 bin accounted for in wght_pb)
     pure subroutine s_coeff(pres, rho, c, coeffs)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_coeff
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_coeff',parallelism='[seq]', &
+            & cray_inline=True)
+
 
         real(wp), intent(in) :: pres, rho, c
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
@@ -740,10 +736,10 @@ contains
 
                     if (alf > small_alf) then
                         nbub = q_cons_vf(bubxb)%sf(id1, id2, id3)
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do q = 1, nb
                             ! Gather moments for this bubble bin
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do r = 2, nmom
                                 moms(r) = q_prim_vf(bubmoms(q, r))%sf(id1, id2, id3)
                             end do
@@ -751,12 +747,12 @@ contains
                             call s_chyqmom(moms, wght(:, q), abscX(:, q), abscY(:, q))
 
                             if (polytropic) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do j = 1, nnode
                                     wght_pb(j, q) = wght(j, q)*(pb0(q) - pv)
                                 end do
                             else
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do j = 1, nnode
                                     chi_vw = 1._wp/(1._wp + R_v/R_n*(pb(id1, id2, id3, j, q)/pv - 1._wp))
                                     x_vw = M_n*chi_vw/(M_v + (M_n - M_v)*chi_vw)
@@ -775,13 +771,13 @@ contains
 
                             ! Compute change in moments due to bubble dynamics
                             r = 1
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i2 = 0, 2
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i1 = 0, 2
                                     if ((i1 + i2) <= 2) then
                                         momsum = 0._wp
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do j = 1, nterms
                                             select case (bubble_model)
                                             case (3)
@@ -811,7 +807,7 @@ contains
 
                             ! Compute change in pb and mv for non-polytropic model
                             if (.not. polytropic) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do j = 1, nnode
                                     drdt = msum(2)
                                     drdt2 = merge(-1._wp, 1._wp, j == 1 .or. j == 2)/(2._wp*sqrt(merge(moms(4) - moms(2)**2._wp, verysmall, moms(4) - moms(2)**2._wp > 0._wp)))
@@ -839,11 +835,11 @@ contains
                             end if
                         end if
                     else
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do q = 1, nb
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i1 = 0, 2
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i2 = 0, 2
                                     moms3d(i1, i2, q)%sf(id1, id2, id3) = 0._wp
                                 end do
@@ -861,11 +857,9 @@ contains
     contains
         ! Helper to select the correct coefficient routine
         subroutine s_coeff_selector(pres, rho, c, coeff, polytropic)
-#ifdef _CRAYFTN
-            !DIR$ INLINEALWAYS s_chyqmom
-#else
-            $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+            $:GPU_ROUTINE(function_name='s_coeff_selector',parallelism='[seq]', &
+                & cray_inline=True)
+
             real(wp), intent(in) :: pres, rho, c
             real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
             logical, intent(in) :: polytropic
@@ -877,11 +871,9 @@ contains
         end subroutine s_coeff_selector
 
         pure subroutine s_chyqmom(momin, wght, abscX, abscY)
-#ifdef _CRAYFTN
-            !DIR$ INLINEALWAYS s_chyqmom
-#else
-            $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+            $:GPU_ROUTINE(function_name='s_chyqmom',parallelism='[seq]', &
+                & cray_inline=True)
+
             real(wp), dimension(nmom), intent(in) :: momin
             real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY
 
@@ -937,11 +929,9 @@ contains
         end subroutine s_chyqmom
 
         pure subroutine s_hyqmom(frho, fup, fmom)
-#ifdef _CRAYFTN
-            !DIR$ INLINEALWAYS s_hyqmom
-#else
-            $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+            $:GPU_ROUTINE(function_name='s_hyqmom',parallelism='[seq]', &
+                & cray_inline=True)
+
             real(wp), dimension(2), intent(inout) :: frho, fup
             real(wp), dimension(3), intent(in) :: fmom
 
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 8e7ef5f628..acdfa769c3 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -626,11 +626,11 @@ contains
                 do k = idwbuff(2)%beg, idwbuff(2)%end
                     do j = idwbuff(1)%beg, idwbuff(1)%end
                         alf_sum%sf(j, k, l) = 0._wp
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = advxb, advxe - 1
                             alf_sum%sf(j, k, l) = alf_sum%sf(j, k, l) + q_cons_qp%vf(i)%sf(j, k, l)
                         end do
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = advxb, advxe - 1
                             q_cons_qp%vf(i)%sf(j, k, l) = q_cons_qp%vf(i)%sf(j, k, l)*(1._wp - q_cons_qp%vf(alf_idx)%sf(j, k, l)) &
                                                           /alf_sum%sf(j, k, l)
@@ -1429,7 +1429,7 @@ contains
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = momxb, E_idx
                             rhs_vf(i)%sf(j, k, l) = &
                                 rhs_vf(i)%sf(j, k, l) + 1._wp/dx(j)* &
@@ -1478,7 +1478,7 @@ contains
                     $:GPU_PARALLEL_LOOP(collapse=2)
                     do l = 0, p
                         do j = 0, m
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = momxb, E_idx
                                 rhs_vf(i)%sf(j, 0, l) = &
                                     rhs_vf(i)%sf(j, 0, l) + 1._wp/(y_cc(1) - y_cc(-1))* &
@@ -1494,7 +1494,7 @@ contains
                 do l = 0, p
                     do k = 1, n
                         do j = 0, m
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = momxb, E_idx
                                 rhs_vf(i)%sf(j, k, l) = &
                                     rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* &
@@ -1510,7 +1510,7 @@ contains
                 do l = 0, p
                     do k = 0, n
                         do j = 0, m
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = momxb, E_idx
                                 rhs_vf(i)%sf(j, k, l) = &
                                     rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* &
@@ -1531,7 +1531,7 @@ contains
                     do l = 0, p
                         do k = 1, n
                             do j = 0, m
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = momxb, E_idx
                                     rhs_vf(i)%sf(j, k, l) = &
                                         rhs_vf(i)%sf(j, k, l) - 5e-1_wp/y_cc(k)* &
@@ -1546,7 +1546,7 @@ contains
                         $:GPU_PARALLEL_LOOP(collapse=2)
                         do l = 0, p
                             do j = 0, m
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = momxb, E_idx
                                     rhs_vf(i)%sf(j, 0, l) = &
                                         rhs_vf(i)%sf(j, 0, l) - 1._wp/y_cc(0)* &
@@ -1561,7 +1561,7 @@ contains
                     do l = 0, p
                         do k = 0, n
                             do j = 0, m
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = momxb, E_idx
                                     rhs_vf(i)%sf(j, k, l) = &
                                         rhs_vf(i)%sf(j, k, l) - 5e-1_wp/y_cc(k)* &
@@ -1596,7 +1596,7 @@ contains
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = momxb, E_idx
                             rhs_vf(i)%sf(j, k, l) = &
                                 rhs_vf(i)%sf(j, k, l) + 1._wp/dz(l)* &
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index eec1e9fafd..3a46f629d2 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -367,13 +367,13 @@ contains
                 do l = is3%beg, is3%end
                     do k = is2%beg, is2%end
                         do j = is1%beg, is1%end
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, contxe
                                 alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                 alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                             end do
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_vels
                                 vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                 vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -381,13 +381,13 @@ contains
 
                             vel_L_rms = 0._wp; vel_R_rms = 0._wp
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_vels
                                 vel_L_rms = vel_L_rms + vel_L(i)**2._wp
                                 vel_R_rms = vel_R_rms + vel_R(i)**2._wp
                             end do
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_fluids
                                 alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                 alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
@@ -431,7 +431,7 @@ contains
                             pres_mag%R = 0._wp
 
                             if (mpp_lim) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     alpha_rho_L(i) = max(0._wp, alpha_rho_L(i))
                                     alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp)
@@ -440,7 +440,7 @@ contains
 
                                 alpha_L = alpha_L/max(alpha_L_sum, sgm_eps)
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     alpha_rho_R(i) = max(0._wp, alpha_rho_R(i))
                                     alpha_R(i) = min(max(0._wp, alpha_R(i)), 1._wp)
@@ -450,7 +450,7 @@ contains
                                 alpha_R = alpha_R/max(alpha_R_sum, sgm_eps)
                             end if
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_fluids
                                 rho_L = rho_L + alpha_rho_L(i)
                                 gamma_L = gamma_L + alpha_L(i)*gammas(i)
@@ -464,13 +464,13 @@ contains
                             end do
 
                             if (viscous) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, 2
                                     Re_L(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do q = 1, Re_size(i)
                                         Re_L(i) = alpha_L(Re_idx(i, q))/Res(i, q) &
                                                   + Re_L(i)
@@ -480,13 +480,13 @@ contains
 
                                 end do
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, 2
                                     Re_R(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do q = 1, Re_size(i)
                                         Re_R(i) = alpha_R(Re_idx(i, q))/Res(i, q) &
                                                   + Re_R(i)
@@ -497,7 +497,7 @@ contains
                             end if
 
                             if (chemistry) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = chemxb, chemxe
                                     Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -594,7 +594,7 @@ contains
                             if (hypoelasticity) then
                                 G_L = 0._wp; G_R = 0._wp
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     G_L = G_L + alpha_L(i)*Gs(i)
                                     G_R = G_R + alpha_R(i)*Gs(i)
@@ -627,7 +627,7 @@ contains
                             !    G_L = 0._wp
                             !    G_R = 0._wp
                             !
-                            !    $:GPU_LOOP()
+                            !    $:GPU_LOOP(parallelism='[seq]')
                             !    do i = 1, num_fluids
                             !        G_L = G_L + alpha_L(i)*Gs(i)
                             !        G_R = G_R + alpha_R(i)*Gs(i)
@@ -636,17 +636,17 @@ contains
                             !    if ((G_L > 1e-3_wp) .and. (G_R > 1e-3_wp)) then
                             !    E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
                             !    E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
-                            !    $:GPU_LOOP()
+                            !    $:GPU_LOOP(parallelism='[seq]')
                             !    do i = 1, b_size-1
                             !        tau_e_L(i) = G_L*qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                             !        tau_e_R(i) = G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
                             !    end do
-                            !    $:GPU_LOOP()
+                            !    $:GPU_LOOP(parallelism='[seq]')
                             !    do i = 1, b_size-1
                             !        tau_e_L(i) = 0._wp
                             !        tau_e_R(i) = 0._wp
                             !    end do
-                            !    $:GPU_LOOP()
+                            !    $:GPU_LOOP(parallelism='[seq]')
                             !    do i = 1, num_dims
                             !        xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
                             !        xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
@@ -674,7 +674,7 @@ contains
                             end if
 
                             if (viscous) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, 2
                                     Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                 end do
@@ -753,7 +753,7 @@ contains
 
                             ! Mass
                             if (.not. relativity) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         (s_M*alpha_rho_R(i)*vel_R(norm_dir) &
@@ -763,7 +763,7 @@ contains
                                         /(s_M - s_P)
                                 end do
                             elseif (relativity) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) &
@@ -844,7 +844,7 @@ contains
                                      + s_M*s_P*(cm%L(3) - cm%R(3))) &
                                     /(s_M - s_P)
                             elseif (bubbles_euler) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_vels
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         (s_M*(rho_R*vel_R(dir_idx(1)) &
@@ -859,7 +859,7 @@ contains
                                         + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
                                 end do
                             else if (hypoelasticity) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_vels
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         (s_M*(rho_R*vel_R(dir_idx(1)) &
@@ -875,7 +875,7 @@ contains
                                         /(s_M - s_P)
                                 end do
                             else
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_vels
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         (s_M*(rho_R*vel_R(dir_idx(1)) &
@@ -971,7 +971,7 @@ contains
                             end if
 
                             ! Advection
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf(j, k, l, i) = &
                                     (qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -996,7 +996,7 @@ contains
                             !end if
 
                             ! Div(U)?
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_vels
                                 vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
                                     (xi_M*(rho_L*vel_L(dir_idx(i))* &
@@ -1017,7 +1017,7 @@ contains
                             end if
 
                             if (chemistry) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = chemxb, chemxe
                                     Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -1067,7 +1067,7 @@ contains
                             #:if (NORM_DIR == 2)
                                 if (cyl_coord) then
                                     !Substituting the advective flux into the inviscid geometrical source flux
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, E_idx
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                     end do
@@ -1076,7 +1076,7 @@ contains
                                         flux_rs${XYZ}$_vf(j, k, l, contxe + 2) &
                                         - (s_M*pres_R - s_P*pres_L)/(s_M - s_P)
                                     ! Geometrical source of the void fraction(s) is zero
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = advxb, advxe
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                     end do
@@ -1089,7 +1089,7 @@ contains
                                         (s_M*tau_e_R(4) - s_P*tau_e_L(4)) &
                                         /(s_M - s_P)
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = strxb, strxe
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                     end do
@@ -1301,7 +1301,7 @@ contains
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -1326,32 +1326,32 @@ contains
                                 alpha_R_sum = 0._wp
 
                                 if (mpp_lim) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
                                         alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     end do
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
                                     end do
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
                                         alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
                                     end do
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
                                     end do
                                 end if
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -1368,13 +1368,13 @@ contains
                                 end do
 
                                 if (viscous) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, 2
                                         Re_L(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do q = 1, Re_size(i)
                                             Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_L(i)
@@ -1384,13 +1384,13 @@ contains
 
                                     end do
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, 2
                                         Re_R(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do q = 1, Re_size(i)
                                             Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_R(i)
@@ -1406,18 +1406,18 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
                                 if (hypoelasticity) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, strxe - strxb + 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
                                     end do
                                     G_L = 0._wp; G_R = 0._wp
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         G_L = G_L + alpha_L(i)*Gs(i)
                                         G_R = G_R + alpha_R(i)*Gs(i)
                                     end do
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, strxe - strxb + 1
                                         ! Elastic contribution to energy if G large enough
                                         if ((G_L > verysmall) .and. (G_R > verysmall)) then
@@ -1434,13 +1434,13 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
                                 if (hyperelasticity) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
                                         xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
                                         xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
                                     end do
                                     G_L = 0._wp; G_R = 0._wp; 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         ! Mixture left and right shear modulus
                                         G_L = G_L + alpha_L(i)*Gs(i)
@@ -1451,7 +1451,7 @@ contains
                                         E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
                                         E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
                                     end if
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, b_size - 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
@@ -1475,7 +1475,7 @@ contains
                                                               vel_avg_rms, 0._wp, c_avg)
 
                                 if (viscous) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, 2
                                         Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                     end do
@@ -1570,7 +1570,7 @@ contains
 
                                 ! COMPUTING FLUXES
                                 ! MASS FLUX.
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*(vel_L(idx1) + s_M*(xi_L - 1._wp)) + &
@@ -1579,7 +1579,7 @@ contains
 
                                 ! MOMENTUM FLUX.
                                 ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = rho_Star*vel_K_Star* &
@@ -1595,7 +1595,7 @@ contains
                                 ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
                                 if (elasticity) then
                                     flux_ene_e = 0._wp; 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
                                         idxi = dir_idx(i)
                                         ! MOMENTUM ELASTIC FLUX.
@@ -1613,7 +1613,7 @@ contains
                                 end if
 
                                 ! VOLUME FRACTION FLUX.
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = advxb, advxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*s_S + &
@@ -1621,7 +1621,7 @@ contains
                                 end do
 
                                 ! SOURCE TERM FOR VOLUME FRACTION ADVECTION FLUX.
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     vel_src_rs${XYZ}$_vf(j, k, l, idxi) = &
@@ -1631,7 +1631,7 @@ contains
 
                                 ! INTERNAL ENERGIES ADVECTION FLUX.
                                 ! K-th pressure and velocity in preparation for the internal energy flux
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     p_K_Star = xi_M*(xi_MP*((pres_L + pi_infs(i)/(1._wp + gammas(i)))* &
                                                             xi_L**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_L) + pres_L) + &
@@ -1650,7 +1650,7 @@ contains
 
                                 ! HYPOELASTIC STRESS EVOLUTION FLUX.
                                 if (hypoelasticity) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, strxe - strxb + 1
                                         flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + &
@@ -1660,7 +1660,7 @@ contains
 
                                 ! REFERENCE MAP FLUX.
                                 if (hyperelasticity) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
                                         flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
@@ -1681,11 +1681,11 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         !Substituting the advective flux into the inviscid geometrical source flux
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = intxb, intxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -1693,7 +1693,7 @@ contains
                                         flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = &
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star
                                         ! Geometrical source of the void fraction(s) is zero
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -1701,7 +1701,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -1726,26 +1726,26 @@ contains
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, contxe
                                     alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                                 end do
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
                                 end do
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     vel_L_rms = vel_L_rms + vel_L(i)**2._wp
                                     vel_R_rms = vel_R_rms + vel_R(i)**2._wp
                                 end do
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
@@ -1758,7 +1758,7 @@ contains
                                 gamma_L = 0._wp
                                 pi_inf_L = 0._wp
                                 qv_L = 0._wp
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_L = rho_L + alpha_rho_L(i)
                                     gamma_L = gamma_L + alpha_L(i)*gammas(i)
@@ -1770,7 +1770,7 @@ contains
                                 gamma_R = 0._wp
                                 pi_inf_R = 0._wp
                                 qv_R = 0._wp
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_R = rho_R + alpha_rho_R(i)
                                     gamma_R = gamma_R + alpha_R(i)*gammas(i)
@@ -1845,7 +1845,7 @@ contains
                                 xi_M = (5e-1_wp + sign(5e-1_wp, s_S))
                                 xi_P = (5e-1_wp - sign(5e-1_wp, s_S))
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*alpha_rho_L(i) &
@@ -1856,7 +1856,7 @@ contains
 
                                 ! Momentum flux.
                                 ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         xi_M*(rho_L*(vel_L(dir_idx(1))* &
@@ -1875,7 +1875,7 @@ contains
 
                                 if (bubbles_euler) then
                                     ! Put p_tilde in
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
                                         flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                             flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) + &
@@ -1886,7 +1886,7 @@ contains
 
                                 flux_rs${XYZ}$_vf(j, k, l, E_idx) = 0._wp
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = alf_idx, alf_idx !only advect the void fraction
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -1896,7 +1896,7 @@ contains
                                 end do
 
                                 ! Source for volume fraction advection equation
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
 
                                     vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = 0._wp
@@ -1907,7 +1907,7 @@ contains
 
                                 ! Add advection flux for bubble variables
                                 if (bubbles_euler) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = bubxb, bubxe
                                         flux_rs${XYZ}$_vf(j, k, l, i) = &
                                             xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -1922,7 +1922,7 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         ! Substituting the advective flux into the inviscid geometrical source flux
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -1939,7 +1939,7 @@ contains
                                                                       (1._wp - dir_flg(dir_idx(1)))* &
                                                                       vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
                                         ! Geometrical source of the void fraction(s) is zero
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -1947,7 +1947,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -1980,7 +1980,7 @@ contains
                         do k = is2%beg, is2%end
                             do j = is1%beg, is1%end
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
@@ -1988,7 +1988,7 @@ contains
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -2006,7 +2006,7 @@ contains
 
                                 ! Retain this in the refactor
                                 if (mpp_lim .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -2014,7 +2014,7 @@ contains
                                         qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
                                     end do
                                 else if (num_fluids > 2) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids - 1
                                         rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -2034,7 +2034,7 @@ contains
                                 qv_R = 0._wp
 
                                 if (mpp_lim .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                                         gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
@@ -2042,7 +2042,7 @@ contains
                                         qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
                                     end do
                                 else if (num_fluids > 2) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids - 1
                                         rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
                                         gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
@@ -2058,13 +2058,13 @@ contains
 
                                 if (viscous) then
                                     if (num_fluids == 1) then ! Need to consider case with num_fluids >= 2
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, 2
                                             Re_L(i) = dflt_real
 
                                             if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                            $:GPU_LOOP()
+                                            $:GPU_LOOP(parallelism='[seq]')
                                             do q = 1, Re_size(i)
                                                 Re_L(i) = (1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q)))/Res(i, q) &
                                                           + Re_L(i)
@@ -2074,13 +2074,13 @@ contains
 
                                         end do
 
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, 2
                                             Re_R(i) = dflt_real
 
                                             if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                            $:GPU_LOOP()
+                                            $:GPU_LOOP(parallelism='[seq]')
                                             do q = 1, Re_size(i)
                                                 Re_R(i) = (1._wp - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q)))/Res(i, q) &
                                                           + Re_R(i)
@@ -2099,7 +2099,7 @@ contains
                                 H_R = (E_R + pres_R)/rho_R
 
                                 if (avg_state == 2) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, nb
                                         R0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, rs(i))
                                         R0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, rs(i))
@@ -2119,7 +2119,7 @@ contains
                                         else
                                             nbub_L_denom = 0._wp
                                             nbub_R_denom = 0._wp
-                                            $:GPU_LOOP()
+                                            $:GPU_LOOP(parallelism='[seq]')
                                             do i = 1, nb
                                                 nbub_L_denom = nbub_L_denom + (R0_L(i)**3._wp)*weight(i)
                                                 nbub_R_denom = nbub_R_denom + (R0_R(i)**3._wp)*weight(i)
@@ -2133,7 +2133,7 @@ contains
                                         nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, bubxb)
                                     end if
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, nb
                                         if (.not. qbmm) then
                                             if (polytropic) then
@@ -2166,7 +2166,7 @@ contains
                                         R3V2Lbar = 0._wp
                                         R3V2Rbar = 0._wp
 
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, nb
                                             PbwR3Lbar = PbwR3Lbar + pbw_L(i)*(R0_L(i)**3._wp)*weight(i)
                                             PbwR3Rbar = PbwR3Rbar + pbw_R(i)*(R0_R(i)**3._wp)*weight(i)
@@ -2201,7 +2201,7 @@ contains
                                     gamma_avg = 5e-1_wp*(gamma_L + gamma_R)
                                     vel_avg_rms = 0._wp
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
                                         vel_avg_rms = vel_avg_rms + (5e-1_wp*(vel_L(i) + vel_R(i)))**2._wp
                                     end do
@@ -2220,7 +2220,7 @@ contains
                                                               vel_avg_rms, 0._wp, c_avg)
 
                                 if (viscous) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, 2
                                         Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                     end do
@@ -2284,7 +2284,7 @@ contains
                                     pcorr = 0._wp
                                 end if
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2303,7 +2303,7 @@ contains
 
                                 ! Include p_tilde
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
                                         xi_M*(rho_L*(vel_L(dir_idx(1))* &
@@ -2335,7 +2335,7 @@ contains
                                     + (s_M/s_L)*(s_P/s_R)*pcorr*s_S
 
                                 ! Volume fraction flux
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = advxb, advxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2345,7 +2345,7 @@ contains
                                 end do
 
                                 ! Source for volume fraction advection equation
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
                                         xi_M*(vel_L(dir_idx(i)) + &
@@ -2361,7 +2361,7 @@ contains
                                 flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
 
                                 ! Add advection flux for bubble variables
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = bubxb, bubxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2390,7 +2390,7 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         ! Substituting the advective flux into the inviscid geometrical source flux
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -2407,7 +2407,7 @@ contains
                                                                       (1._wp - dir_flg(dir_idx(1)))* &
                                                                       vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
                                         ! Geometrical source of the void fraction(s) is zero
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -2415,7 +2415,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -2455,14 +2455,14 @@ contains
 
                                 !idx1 = 1; if (dir_idx(1) == 2) idx1 = 2; if (dir_idx(1) == 3) idx1 = 3
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
                                 end do
 
                                 vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
                                     vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
@@ -2489,32 +2489,32 @@ contains
                                 ! Change this by splitting it into the cases
                                 ! present in the bubbles_euler
                                 if (mpp_lim) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
                                         alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
                                     end do
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
                                     end do
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
                                         alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
                                     end do
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
                                     end do
                                 end if
 
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                     gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
@@ -2528,13 +2528,13 @@ contains
                                 end do
 
                                 if (viscous) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, 2
                                         Re_L(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_L(i) = 0._wp
 
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do q = 1, Re_size(i)
                                             Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_L(i)
@@ -2544,13 +2544,13 @@ contains
 
                                     end do
 
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, 2
                                         Re_R(i) = dflt_real
 
                                         if (Re_size(i) > 0) Re_R(i) = 0._wp
 
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do q = 1, Re_size(i)
                                             Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) &
                                                       + Re_R(i)
@@ -2562,7 +2562,7 @@ contains
 
                                 if (chemistry) then
                                     c_sum_Yi_Phi = 0.0_wp
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = chemxb, chemxe
                                         Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -2621,19 +2621,19 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
                                 if (hypoelasticity) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, strxe - strxb + 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
                                     end do
                                     G_L = 0._wp
                                     G_R = 0._wp
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         G_L = G_L + alpha_L(i)*Gs(i)
                                         G_R = G_R + alpha_R(i)*Gs(i)
                                     end do
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, strxe - strxb + 1
                                         ! Elastic contribution to energy if G large enough
                                         if ((G_L > verysmall) .and. (G_R > verysmall)) then
@@ -2650,14 +2650,14 @@ contains
 
                                 ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
                                 if (hyperelasticity) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
                                         xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
                                         xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
                                     end do
                                     G_L = 0._wp
                                     G_R = 0._wp
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
                                         ! Mixture left and right shear modulus
                                         G_L = G_L + alpha_L(i)*Gs(i)
@@ -2668,7 +2668,7 @@ contains
                                         E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
                                         E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
                                     end if
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, b_size - 1
                                         tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
                                         tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
@@ -2692,7 +2692,7 @@ contains
                                                               vel_avg_rms, c_sum_Yi_Phi, c_avg)
 
                                 if (viscous) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, 2
                                         Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
                                     end do
@@ -2768,7 +2768,7 @@ contains
 
                                 ! COMPUTING THE HLLC FLUXES
                                 ! MASS FLUX.
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, contxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2779,7 +2779,7 @@ contains
 
                                 ! MOMENTUM FLUX.
                                 ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = &
@@ -2814,7 +2814,7 @@ contains
                                 ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
                                 if (elasticity) then
                                     flux_ene_e = 0._wp
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
                                         idxi = dir_idx(i)
                                         ! MOMENTUM ELASTIC FLUX.
@@ -2833,7 +2833,7 @@ contains
 
                                 ! HYPOELASTIC STRESS EVOLUTION FLUX.
                                 if (hypoelasticity) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, strxe - strxb + 1
                                         flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + &
@@ -2842,7 +2842,7 @@ contains
                                 end if
 
                                 ! VOLUME FRACTION FLUX.
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = advxb, advxe
                                     flux_rs${XYZ}$_vf(j, k, l, i) = &
                                         xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
@@ -2852,7 +2852,7 @@ contains
                                 end do
 
                                 ! VOLUME FRACTION SOURCE FLUX.
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     idxi = dir_idx(i)
                                     vel_src_rs${XYZ}$_vf(j, k, l, idxi) = &
@@ -2875,7 +2875,7 @@ contains
 
                                 ! REFERENCE MAP FLUX.
                                 if (hyperelasticity) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
                                         flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
                                             xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
@@ -2888,7 +2888,7 @@ contains
                                 flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, idx1)
 
                                 if (chemistry) then
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = chemxb, chemxe
                                         Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
                                         Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
@@ -2903,7 +2903,7 @@ contains
                                 #:if (NORM_DIR == 2)
                                     if (cyl_coord) then
                                         !Substituting the advective flux into the inviscid geometrical source flux
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, E_idx
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
                                         end do
@@ -2920,7 +2920,7 @@ contains
                                                                       (1._wp - dir_flg(idx1))* &
                                                                       vel_R(idx1)) - vel_R(idx1))))
                                         ! Geometrical source of the void fraction(s) is zero
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = advxb, advxe
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -2928,7 +2928,7 @@ contains
                                 #:endif
                                 #:if (NORM_DIR == 3)
                                     if (grid_geometry == 3) then
-                                        $:GPU_LOOP()
+                                        $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, sys_size
                                             flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
                                         end do
@@ -3119,7 +3119,7 @@ contains
                             ! Sum properties of all fluid components
                             rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp
                             rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_fluids
                                 rho%L = rho%L + alpha_rho_L(i)
                                 gamma%L = gamma%L + alpha_L(i)*gammas(i)
@@ -3289,7 +3289,7 @@ contains
                             ! Energy
                             flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7)
                             ! Partial fraction
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = advxb, advxe
                                 flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now)
                             end do
@@ -3996,7 +3996,7 @@ contains
 
                     ! Average velocities and their derivatives at the interface
                     ! For cylindrical: x-dir ~ axial (z_cyl), y-dir ~ radial (r_cyl), z-dir ~ azimuthal (theta_cyl)
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i_vel = 1, num_dims
                         avg_v_int(i_vel) = 0.5_wp*(velL_vf(i_vel)%sf(j, k, l) + velR_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
 
@@ -4074,7 +4074,7 @@ contains
                             end if
                         end select
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i_vel = 1, num_dims
                             flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) = flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) - stress_vector_shear(i_vel)
                             flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(i_vel)*stress_vector_shear(i_vel)
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index e8efc4be29..b89c3a9c1d 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -31,11 +31,8 @@ contains
         !! @param k y index
         !! @param l z index
     pure subroutine s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l)
-#ifdef _CRAYFTN
-        !DIR$ INLINEALWAYS s_compute_enthalpy
-#else
-        $:GPU_ROUTINE(parallelism='[seq]')
-#endif
+        $:GPU_ROUTINE(function_name='s_compute_enthalpy',parallelism='[seq]', &
+            & cray_inline=True)
 
         type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
         real(wp), intent(inout), dimension(num_fluids) :: alpha
@@ -49,7 +46,7 @@ contains
 
         integer :: i
 
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
             alpha_rho(i) = q_prim_vf(i)%sf(j, k, l)
             alpha(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
@@ -64,13 +61,13 @@ contains
             call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha, alpha_rho, Re)
         end if
 
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_vels
             vel(i) = q_prim_vf(contxe + i)%sf(j, k, l)
         end do
 
         vel_sum = 0._wp
-        $:GPU_LOOP()
+        $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_vels
             vel_sum = vel_sum + vel(i)**2._wp
         end do
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 16420ab3de..c1a438c355 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -81,7 +81,7 @@ contains
         do l = is3_viscous%beg, is3_viscous%end
             do k = is2_viscous%beg, is2_viscous%end
                 do j = is1_viscous%beg, is1_viscous%end
-                    $:GPU_LOOP()
+                    $:GPU_LOOP(parallelism='[seq]')
                     do i = momxb, E_idx
                         tau_Re_vf(i)%sf(j, k, l) = 0._wp
                     end do
@@ -95,7 +95,7 @@ contains
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -111,14 +111,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -137,7 +137,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -148,7 +148,7 @@ contains
 
                             end if
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -156,12 +156,12 @@ contains
                             end do
 
                             if (viscous) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -181,7 +181,7 @@ contains
                                         - 2._wp*grad_x_vf(1)%sf(j, k, l) &
                                         - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
                                        (3._wp*Re_visc(1))
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, 2
                             tau_Re_vf(contxe + i)%sf(j, k, l) = &
                                 tau_Re_vf(contxe + i)%sf(j, k, l) - &
@@ -203,7 +203,7 @@ contains
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -219,14 +219,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -245,7 +245,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -256,7 +256,7 @@ contains
 
                             end if
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -264,12 +264,12 @@ contains
                             end do
 
                             if (viscous) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -308,7 +308,7 @@ contains
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -324,14 +324,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -350,7 +350,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -361,7 +361,7 @@ contains
 
                             end if
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -369,12 +369,12 @@ contains
                             end do
 
                             if (viscous) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -394,7 +394,7 @@ contains
                                         y_cc(k) + grad_y_vf(3)%sf(j, k, l))/ &
                                        Re_visc(1)
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 2, 3
                             tau_Re_vf(contxe + i)%sf(j, k, l) = &
                                 tau_Re_vf(contxe + i)%sf(j, k, l) - &
@@ -417,7 +417,7 @@ contains
                 do k = -1, 1
                     do j = is1_viscous%beg, is1_viscous%end
 
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_fluids
                             alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
                             if (bubbles_euler .and. num_fluids == 1) then
@@ -433,14 +433,14 @@ contains
                             pi_inf_visc = 0._wp
 
                             if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
                             else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids - 1
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -459,7 +459,7 @@ contains
                             alpha_visc_sum = 0._wp
 
                             if (mpp_lim) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
                                     alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
@@ -470,7 +470,7 @@ contains
 
                             end if
 
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_fluids
                                 rho_visc = rho_visc + alpha_rho_visc(i)
                                 gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
@@ -478,12 +478,12 @@ contains
                             end do
 
                             if (viscous) then
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, 2
                                     Re_visc(i) = dflt_real
 
                                     if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    $:GPU_LOOP()
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do q = 1, Re_size(i)
                                         Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
                                                      + Re_visc(i)
@@ -597,7 +597,7 @@ contains
             do l = is3_viscous%beg, is3_viscous%end
                 do k = iy%beg, iy%end
                     do j = is1_viscous%beg + 1, is1_viscous%end
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = iv%beg, iv%end
                             dqL_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
                                 (q_prim_qp%vf(i)%sf(j, k, l) - &
@@ -612,7 +612,7 @@ contains
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end - 1
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = iv%beg, iv%end
                             dqR_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
                                 (q_prim_qp%vf(i)%sf(j + 1, k, l) - &
@@ -629,7 +629,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg, is1_viscous%end
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = iv%beg, iv%end
                                 dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
                                     (q_prim_qp%vf(i)%sf(k, j, l) - &
@@ -644,7 +644,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg, is1_viscous%end
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = iv%beg, iv%end
                                 dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
                                     (q_prim_qp%vf(i)%sf(k, j + 1, l) - &
@@ -659,7 +659,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg + 1, is2_viscous%end
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = iv%beg, iv%end
                                 dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
                                     (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
@@ -678,7 +678,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do j = is2_viscous%beg, is2_viscous%end - 1
                         do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = iv%beg, iv%end
                                 dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
                                     (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
@@ -698,7 +698,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg + 1, is1_viscous%end
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = iv%beg, iv%end
                                 dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
                                     (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
@@ -718,7 +718,7 @@ contains
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = is2_viscous%beg + 1, is2_viscous%end - 1
                         do j = is1_viscous%beg, is1_viscous%end - 1
-                            $:GPU_LOOP()
+                            $:GPU_LOOP(parallelism='[seq]')
                             do i = iv%beg, iv%end
                                 dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
                                     (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
@@ -740,7 +740,7 @@ contains
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
@@ -756,7 +756,7 @@ contains
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
@@ -772,7 +772,7 @@ contains
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg + 1, is1_viscous%end
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
@@ -793,7 +793,7 @@ contains
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do k = is2_viscous%beg, is2_viscous%end
                             do j = is1_viscous%beg, is1_viscous%end - 1
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
@@ -814,7 +814,7 @@ contains
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg + 1, is2_viscous%end
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
@@ -835,7 +835,7 @@ contains
                     do l = is3_viscous%beg + 1, is3_viscous%end - 1
                         do j = is2_viscous%beg, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
@@ -856,7 +856,7 @@ contains
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
@@ -877,7 +877,7 @@ contains
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg + 1, is2_viscous%end - 1
                             do k = is1_viscous%beg, is1_viscous%end
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
 
                                     dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
@@ -897,7 +897,7 @@ contains
                     do j = is3_viscous%beg + 1, is3_viscous%end
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
 
                                     dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
@@ -917,7 +917,7 @@ contains
                     do j = is3_viscous%beg, is3_viscous%end - 1
                         do l = is2_viscous%beg, is2_viscous%end
                             do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = iv%beg, iv%end
                                     dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
                                         (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + &
@@ -1214,7 +1214,7 @@ contains
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg + 1, is1_viscous%end - 1
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = iv%beg, iv%end
                             dv_ds_vf(i)%sf(j, k, l) = &
                                 1._wp/((1._wp + wa_flg)*dL(j)) &
@@ -1242,7 +1242,7 @@ contains
             do l = is3_viscous%beg, is3_viscous%end
                 do k = is2_viscous%beg + 1, is2_viscous%end - 1
                     do j = is1_viscous%beg, is1_viscous%end
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = iv%beg, iv%end
                             dv_ds_vf(i)%sf(j, k, l) = &
                                 1._wp/((1._wp + wa_flg)*dL(k)) &
@@ -1270,7 +1270,7 @@ contains
             do l = is3_viscous%beg + 1, is3_viscous%end - 1
                 do k = is2_viscous%beg, is2_viscous%end
                     do j = is1_viscous%beg, is1_viscous%end
-                        $:GPU_LOOP()
+                        $:GPU_LOOP(parallelism='[seq]')
                         do i = iv%beg, iv%end
                             dv_ds_vf(i)%sf(j, k, l) = &
                                 1._wp/((1._wp + wa_flg)*dL(l)) &
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 2dd5823668..fa740669b7 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -789,7 +789,7 @@ contains
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, v_size
                                     ! reconstruct from left side
 
@@ -908,7 +908,7 @@ contains
                     do l = is3_weno%beg, is3_weno%end
                         do k = is2_weno%beg, is2_weno%end
                             do j = is1_weno%beg, is1_weno%end
-                                $:GPU_LOOP()
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, v_size
 
                                     if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3, k, l, i) ! temporary field value array for clarity

From 41c8280b1d9dffea49521c40459597d3eb37a7d8 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 19 Jun 2025 18:10:50 -0400
Subject: [PATCH 63/75] Combined end directives with the beginning directive

---
 src/common/include/parallel_macros.fpp | 28 ++++++++++++++++++--------
 src/simulation/m_fftw.fpp              | 20 +++++++++---------
 src/simulation/m_weno.fpp              |  9 ++++++---
 3 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index 87868d1285..c79a05e0f3 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -132,6 +132,7 @@
 
 #:def GEN_COLLAPSE_STR(collapse)
     #:if collapse is not None
+        #:set collapse = int(collapse)
         #:assert isinstance(collapse, int)
         #:assert collapse > 1
         #:set collapse_val = 'collapse(' + str(collapse) + ') '
@@ -327,8 +328,13 @@
     $:acc_directive
 #:enddef
 
-#:def GPU_DATA(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None)
-    
+#:def GPU_DATA(code, copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None)
+    #:assert code is not None
+    #:assert isinstance(code, str)
+    #:if code == '' or code.isspace()
+        #:stop 'GPU_DATA macro has no effect on the code as it is not surrounding any code'
+    #:endif
+
     #:set copy_val = GEN_COPY_STR(copy)
 
     #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
@@ -355,21 +361,27 @@
         & deviceptr_val.strip('\n') + attach_val.strip('\n') + &
         & default_val.strip('\n')
     #:set acc_directive = '!$acc data ' + clause_val + extraAccArgs_val.strip('\n')
+    #:set end_acc_directive = '!$acc end data'
     $:acc_directive
+    $:code
+    $:end_acc_directive
 #:enddef
 
-#:def GPU_HOST_DATA(use_device=None, extraAccArgs=None)
+#:def GPU_HOST_DATA(code, use_device=None, extraAccArgs=None)
+    #:assert code is not None
+    #:assert isinstance(code, str)
+    #:if code == '' or code.isspace()
+        #:stop 'GPU_HOST_DATA macro has no effect on the code as it is not surrounding any code'
+    #:endif
     #:set use_device_val = GEN_USE_DEVICE_STR(use_device)
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
 
     #:set clause_val = use_device_val.strip('\n')
     #:set acc_directive = '!$acc host_data ' + clause_val + extraAccArgs_val.strip('\n')
+    #:set end_acc_directive = '!$acc end host_data'
     $:acc_directive
-#:enddef
-
-#:def GPU_END_HOST_DATA()
-    #:set acc_directive = '!$acc end host_data'
-    $:acc_directive
+    $:code
+    $:end_acc_directive
 #:enddef
 
 #:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None)
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 8c64193e28..425e1be060 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -161,15 +161,15 @@ contains
         p_cmplx => data_cmplx_gpu
         p_fltr_cmplx => data_fltr_cmplx_gpu
 
-        $:GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
-        $:GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]')
+        #:call GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
+        #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]')
 #if defined(__PGI)
         ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
         ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
         call hipCheck(hipDeviceSynchronize())
 #endif
-        $:GPU_END_HOST_DATA()
+        #:endcall GPU_HOST_DATA
         Nfq = 3
         $:GPU_UPDATE(device='[Nfq]')
 
@@ -182,14 +182,14 @@ contains
             end do
         end do
 
-        $:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
+        #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
 #if defined(__PGI)
         ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else
         ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
         call hipCheck(hipDeviceSynchronize())
 #endif
-        $:GPU_END_HOST_DATA()
+        #:endcall GPU_HOST_DATA
 
         $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
@@ -221,14 +221,14 @@ contains
                 end do
             end do
 
-            $:GPU_HOST_DATA(use_device='[p_real, p_cmplx]')
+            #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx]')
 #if defined(__PGI)
             ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
             ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
             call hipCheck(hipDeviceSynchronize())
 #endif
-            $:GPU_END_HOST_DATA()
+            #:endcall GPU_HOST_DATA
 
             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
             $:GPU_UPDATE(device='[Nfq]')
@@ -242,14 +242,14 @@ contains
                 end do
             end do
 
-            $:GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
+            #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
 #if defined(__PGI)
             ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else
             ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
             call hipCheck(hipDeviceSynchronize())
 #endif
-            $:GPU_END_HOST_DATA()
+            #:endcall GPU_HOST_DATA
 
             $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
             do k = 1, sys_size
@@ -293,7 +293,7 @@ contains
             end do
         end do
 #endif
-!$acc end data
+#:endcall GPU_DATA
     end subroutine s_apply_fourier_filter
 
     !>  The purpose of this subroutine is to destroy the fftw plan
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index fa740669b7..f7eeec4a7a 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -1148,15 +1148,17 @@ contains
                     block
                         use CuTensorEx
 
-                        $:GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
+                        #:call GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
                         v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1, sys_size], order=[2, 1, 3, 4])
+                        #:endcall GPU_HOST_DATA
                     end block
                 else
                     block
                         use CuTensorEx
 
-                        $:GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
+                        #:call GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
                         v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1 + 2*buff_size, sys_size], order=[2, 1, 3, 4])
+                        #:endcall GPU_HOST_DATA
                     end block
                 end if
             else
@@ -1184,8 +1186,9 @@ contains
                 block
                     use CuTensorEx
 
-                    $:GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_z]')
+                    #:call GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_z]')
                     v_rs_ws_z = reshape(v_rs_ws_x, shape=[p + 1 + 2*buff_size, n + 2*buff_size + 1, m + 2*buff_size + 1, sys_size], order=[3, 2, 1, 4])
+                    #:endcall
                 end block
             else
 #endif

From bfefe6aa345b8c217702681c8d4fc51c9663ef00 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 19 Jun 2025 18:52:45 -0400
Subject: [PATCH 64/75] Removed whitespace in macros, added mute, and ran
 formatter

---
 src/common/include/macros.fpp          |   2 +-
 src/common/include/parallel_macros.fpp |  70 +----------
 src/common/m_boundary_common.fpp       |   2 +-
 src/common/m_variables_conversion.fpp  |   3 -
 src/simulation/m_fftw.fpp              | 168 ++++++++++++-------------
 src/simulation/m_qbmm.fpp              |   1 -
 src/simulation/m_weno.fpp              |   6 +-
 7 files changed, 94 insertions(+), 158 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 759ee24175..fea730cbd1 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -106,4 +106,4 @@
                          //${message or '"No error description."'}$)
     end if
 #:enddef
-! New line at end of file is required for FYPP
\ No newline at end of file
+! New line at end of file is required for FYPP
diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index c79a05e0f3..5b0b8bf564 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -1,3 +1,5 @@
+#:mute
+
 #:def ASSERT_LIST(data, datatype)
     #:assert data is not None
     #:assert isinstance(data, list)
@@ -7,7 +9,6 @@
 
 #:def GEN_PARENTHESES_CLAUSE(clause_name, clause_str)
     #:set clause_regex = re.compile(',(?![^(]*\\))')
-
     #:assert isinstance(clause_name, str)
     #:if clause_str is not None
         #:set count = 0
@@ -203,35 +204,20 @@
     & default='present', firstprivate=None, reduction=None, reductionOp=None, &
     & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, &
     & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None)
-
     #:set collapse_val = GEN_COLLAPSE_STR(collapse)
-    
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
-
     #:set default_val = GEN_DEFAULT_STR(default)
-
     #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n')
-
     #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
-
     #:set copy_val = GEN_COPY_STR(copy)
-
     #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
-    
     #:set copyout_val = GEN_COPYOUT_STR(copyout)
-
     #:set create_val = GEN_CREATE_STR(create)
-
     #:set no_create_val = GEN_NOCREATE_STR(no_create)
-
     #:set present_val = GEN_PRESENT_STR(present)
-
     #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
-
     #:set attach_val = GEN_ATTACH_STR(attach)
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
         & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + &
         & copy_val.strip('\n') + copyin_val.strip('\n') + &
@@ -246,16 +232,13 @@
 #:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, extraAccArgs=None)
     #:assert isinstance(cray_inline, bool)
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
-
     #:assert isinstance(nohost, bool)
     #:if nohost == True
         #:set nohost_val = 'nohost'
     #:else
         #:set nohost_val = ''
     #:endif
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = parallelism_val.strip('\n') + nohost_val.strip('\n')
     #:set acc_directive = '!$acc routine ' + &
         & clause_val + extraAccArgs_val.strip('\n')
@@ -275,23 +258,14 @@
 #:enddef
 
 #:def GPU_DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None)
-    
     #:set copy_val = GEN_COPY_STR(copy)
-
     #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
-
     #:set copyout_val = GEN_COPYOUT_STR(copyout)
-
     #:set create_val = GEN_CREATE_STR(create)
-
     #:set present_val = GEN_PRESENT_STR(present)
-
     #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
-
     #:set link_val = GEN_LINK_STR(link)
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + &
         & copyout_val.strip('\n') + create_val.strip('\n') + &
         & present_val.strip('\n') + deviceptr_val.strip('\n') + &
@@ -301,11 +275,8 @@
 #:enddef
 
 #:def GPU_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None)
-
     #:set collapse_val = GEN_COLLAPSE_STR(collapse)
-
     #:set parallelism_val = GEN_PARALLELISM_STR(parallelism)
-
     #:if data_dependency is not None
         #:assert isinstance(data_dependency, str)
         #:assert (data_dependency == 'auto' or data_dependency == 'independent')
@@ -313,13 +284,9 @@
     #:else
         #:set data_dependency_val = ''
     #:endif
-    
     #:set private_val = GEN_PRIVATE_STR(private, False)
-
     #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + &
         & data_dependency_val.strip('\n') + private_val.strip('\n') + &
         & reduction_val.strip('\n')
@@ -334,27 +301,16 @@
     #:if code == '' or code.isspace()
         #:stop 'GPU_DATA macro has no effect on the code as it is not surrounding any code'
     #:endif
-
     #:set copy_val = GEN_COPY_STR(copy)
-
     #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
-
     #:set copyout_val = GEN_COPYOUT_STR(copyout)
-
     #:set create_val = GEN_CREATE_STR(create)
-
     #:set no_create_val = GEN_NOCREATE_STR(no_create)
-
     #:set present_val = GEN_PRESENT_STR(present)
-
     #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
-
     #:set attach_val = GEN_ATTACH_STR(attach)
-
     #:set default_val = GEN_DEFAULT_STR(default)
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + &
         & copyout_val.strip('\n') + create_val.strip('\n') + &
         & no_create_val.strip('\n') + present_val.strip('\n') + & 
@@ -375,7 +331,6 @@
     #:endif
     #:set use_device_val = GEN_USE_DEVICE_STR(use_device)
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = use_device_val.strip('\n')
     #:set acc_directive = '!$acc host_data ' + clause_val + extraAccArgs_val.strip('\n')
     #:set end_acc_directive = '!$acc end host_data'
@@ -386,13 +341,9 @@
 
 #:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None)
     #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
-
     #:set create_val = GEN_CREATE_STR(create)
-
     #:set attach_val = GEN_ATTACH_STR(attach)
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = copyin_val.strip('\n') + create_val.strip('\n') + attach_val.strip('\n')
     #:set acc_directive = '!$acc enter data ' + clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
@@ -400,13 +351,9 @@
 
 #:def GPU_EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None)
     #:set copyout_val = GEN_COPYOUT_STR(copyout)
-
     #:set delete_val = GEN_DELETE_STR(delete)
-
     #:set detach_val = GEN_DETACH_STR(detach)
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = copyout_val.strip('\n') + delete_val.strip('\n') + detach_val.strip('\n')
     #:set acc_directive = '!$acc exit data ' + clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
@@ -414,9 +361,7 @@
 
 #:def GPU_CACHE(cache, extraAccArgs=None)
     #:set cache_val = GEN_PARENTHESES_CLAUSE('cache', cache)
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = cache_val.strip('\n')
     #:set acc_directive = '!$acc ' + clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
@@ -425,11 +370,8 @@
 #:def GPU_ATOMIC(atomic, extraAccArgs=None)
     #:assert isinstance(atomic, str)
     #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture')
-
     #:set atomic_val = atomic
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = atomic_val.strip('\n')
     #:set acc_directive = '!$acc atomic ' + clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
@@ -437,11 +379,8 @@
 
 #:def GPU_UPDATE(host=None, device=None, extraAccArgs=None)
     #:set host_val = GEN_HOST_STR(host)
-
     #:set device_val = GEN_DEVICE_STR(device)
-
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = host_val.strip('\n') + device_val.strip('\n')
     #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
@@ -449,9 +388,10 @@
 
 #:def GPU_WAIT(extraAccArgs=None)
     #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
-
     #:set clause_val = ''
     #:set acc_directive = '!$acc wait ' + clause_val + extraAccArgs_val.strip('\n')
     $:acc_directive
 #:enddef
-! New line at end of file is required for FYPP
\ No newline at end of file
+
+#:endmute
+! New line at end of file is required for FYPP
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 2d92f371e1..2c48c760f2 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -1405,7 +1405,7 @@ contains
     end subroutine s_color_function_reflective
 
     pure subroutine s_color_function_ghost_cell_extrapolation(c_divs, bc_dir, bc_loc, k, l)
-        $:GPU_ROUTINE(function_name='s_color_function_ghost_cell_extrapolation', & 
+        $:GPU_ROUTINE(function_name='s_color_function_ghost_cell_extrapolation', &
             & parallelism='[seq]', cray_inline=True)
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
         integer, intent(in) :: bc_dir, bc_loc
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 08034d3106..3dff4c241f 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -457,7 +457,6 @@ contains
         $:GPU_ROUTINE(function_name='s_convert_species_to_mixture_variables_acc', &
             & parallelism='[seq]', cray_inline=True)
 
-
         real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K
 
         real(wp), dimension(num_fluids), intent(inout) :: alpha_rho_K, alpha_K !<
@@ -1597,7 +1596,6 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_speed_of_sound', &
             & parallelism='[seq]', cray_inline=True)
 
-
         real(wp), intent(in) :: pres
         real(wp), intent(in) :: rho, gamma, pi_inf
         real(wp), intent(in) :: H
@@ -1665,7 +1663,6 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_fast_magnetosonic_speed', &
             & parallelism='[seq]', cray_inline=True)
 
-
         real(wp), intent(in) :: B(3), rho, c
         real(wp), intent(in) :: h ! only used for relativity
         real(wp), intent(out) :: c_fast
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 425e1be060..87f612a76b 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -162,138 +162,138 @@ contains
         p_fltr_cmplx => data_fltr_cmplx_gpu
 
         #:call GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]')
-        #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]')
+            #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]')
 #if defined(__PGI)
-        ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+                ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
-        ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
-        call hipCheck(hipDeviceSynchronize())
+                ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
+                call hipCheck(hipDeviceSynchronize())
 #endif
-        #:endcall GPU_HOST_DATA
-        Nfq = 3
-        $:GPU_UPDATE(device='[Nfq]')
+            #:endcall GPU_HOST_DATA
+            Nfq = 3
+            $:GPU_UPDATE(device='[Nfq]')
 
-        $:GPU_PARALLEL_LOOP(collapse=3)
-        do k = 1, sys_size
-            do j = 0, m
-                do l = 1, Nfq
-                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 1, Nfq
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
+                    end do
                 end do
             end do
-        end do
 
-        #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
+            #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
 #if defined(__PGI)
-        ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+                ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else
-        ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
-        call hipCheck(hipDeviceSynchronize())
+                ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
+                call hipCheck(hipDeviceSynchronize())
 #endif
-        #:endcall GPU_HOST_DATA
-
-        $:GPU_PARALLEL_LOOP(collapse=3)
-        do k = 1, sys_size
-            do j = 0, m
-                do l = 0, p
-                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
-                    q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
-                end do
-            end do
-        end do
-
-        do i = 1, fourier_rings
+            #:endcall GPU_HOST_DATA
 
             $:GPU_PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
-                    do l = 1, cmplx_size
-                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                    do l = 0, p
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
+                        q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
                     end do
                 end do
             end do
 
-            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
-            do k = 1, sys_size
-                do j = 0, m
-                    do l = 0, p
-                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l)
+            do i = 1, fourier_rings
+
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do k = 1, sys_size
+                    do j = 0, m
+                        do l = 1, cmplx_size
+                            data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                        end do
+                    end do
+                end do
+
+                $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
+                do k = 1, sys_size
+                    do j = 0, m
+                        do l = 0, p
+                            data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l)
+                        end do
                     end do
                 end do
-            end do
 
-            #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx]')
+                #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx]')
 #if defined(__PGI)
-            ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+                    ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
-            ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
-            call hipCheck(hipDeviceSynchronize())
+                    ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
+                    call hipCheck(hipDeviceSynchronize())
 #endif
-            #:endcall GPU_HOST_DATA
+                #:endcall GPU_HOST_DATA
 
-            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
-            $:GPU_UPDATE(device='[Nfq]')
+                Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+                $:GPU_UPDATE(device='[Nfq]')
 
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do k = 1, sys_size
-                do j = 0, m
-                    do l = 1, Nfq
-                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do k = 1, sys_size
+                    do j = 0, m
+                        do l = 1, Nfq
+                            data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
+                        end do
                     end do
                 end do
-            end do
 
-            #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
+                #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]')
 #if defined(__PGI)
-            ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+                    ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else
-            ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
-            call hipCheck(hipDeviceSynchronize())
+                    ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
+                    call hipCheck(hipDeviceSynchronize())
 #endif
-            #:endcall GPU_HOST_DATA
+                #:endcall GPU_HOST_DATA
 
-            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
-            do k = 1, sys_size
-                do j = 0, m
-                    do l = 0, p
-                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
-                        q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
+                do k = 1, sys_size
+                    do j = 0, m
+                        do l = 0, p
+                            data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
+                            q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                        end do
                     end do
                 end do
-            end do
 
-        end do
-
-#else
-        Nfq = 3
-        do j = 0, m
-            do k = 1, sys_size
-                data_fltr_cmplx(:) = (0_dp, 0_dp)
-                data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p)
-                call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
-                data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
-                call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
-                data_real(:) = data_real(:)/real(real_size, dp)
-                q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1)
             end do
-        end do
 
-        ! Apply Fourier filter to additional rings
-        do i = 1, fourier_rings
-            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+#else
+            Nfq = 3
             do j = 0, m
                 do k = 1, sys_size
                     data_fltr_cmplx(:) = (0_dp, 0_dp)
-                    data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p)
+                    data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p)
                     call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
                     data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
                     call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
                     data_real(:) = data_real(:)/real(real_size, dp)
-                    q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1)
+                    q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1)
+                end do
+            end do
+
+            ! Apply Fourier filter to additional rings
+            do i = 1, fourier_rings
+                Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+                do j = 0, m
+                    do k = 1, sys_size
+                        data_fltr_cmplx(:) = (0_dp, 0_dp)
+                        data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p)
+                        call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
+                        data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
+                        call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
+                        data_real(:) = data_real(:)/real(real_size, dp)
+                        q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1)
+                    end do
                 end do
             end do
-        end do
 #endif
-#:endcall GPU_DATA
+        #:endcall GPU_DATA
     end subroutine s_apply_fourier_filter
 
     !>  The purpose of this subroutine is to destroy the fftw plan
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 40301942b6..a104937575 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -637,7 +637,6 @@ contains
         $:GPU_ROUTINE(function_name='s_coeff',parallelism='[seq]', &
             & cray_inline=True)
 
-
         real(wp), intent(in) :: pres, rho, c
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
 
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index f7eeec4a7a..73ff699454 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -1149,7 +1149,7 @@ contains
                         use CuTensorEx
 
                         #:call GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
-                        v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1, sys_size], order=[2, 1, 3, 4])
+                            v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1, sys_size], order=[2, 1, 3, 4])
                         #:endcall GPU_HOST_DATA
                     end block
                 else
@@ -1157,7 +1157,7 @@ contains
                         use CuTensorEx
 
                         #:call GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]')
-                        v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1 + 2*buff_size, sys_size], order=[2, 1, 3, 4])
+                            v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1 + 2*buff_size, sys_size], order=[2, 1, 3, 4])
                         #:endcall GPU_HOST_DATA
                     end block
                 end if
@@ -1187,7 +1187,7 @@ contains
                     use CuTensorEx
 
                     #:call GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_z]')
-                    v_rs_ws_z = reshape(v_rs_ws_x, shape=[p + 1 + 2*buff_size, n + 2*buff_size + 1, m + 2*buff_size + 1, sys_size], order=[3, 2, 1, 4])
+                        v_rs_ws_z = reshape(v_rs_ws_x, shape=[p + 1 + 2*buff_size, n + 2*buff_size + 1, m + 2*buff_size + 1, sys_size], order=[3, 2, 1, 4])
                     #:endcall
                 end block
             else

From 31b078279f8b2ccdf9da01c3c63ed1ea7c41ceb6 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 19 Jun 2025 22:30:01 -0400
Subject: [PATCH 65/75] Had extra variables from a merge

---
 src/simulation/m_cbc.fpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 557ad6d1ae..55c385c78f 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -50,7 +50,6 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsz_vf
     $:GPU_DECLARE(create='[q_prim_rsx_vf,q_prim_rsy_vf,q_prim_rsz_vf]')
 
-    type(scalar_field), allocatable, dimension(:) :: F_rs_vf, F_src_rs_vf !<
 
     !! Cell-average fluxes (src - source). These are directly determined from the
     !! cell-average primitive variables, q_prims_rs_vf, and not a Riemann solver.

From 50af22748163829db7d04d2b5ad00a441f38de13 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Thu, 19 Jun 2025 22:36:55 -0400
Subject: [PATCH 66/75] Ran formatter

---
 src/simulation/m_cbc.fpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 55c385c78f..b92812ae7a 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -50,7 +50,6 @@ module m_cbc
     real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsz_vf
     $:GPU_DECLARE(create='[q_prim_rsx_vf,q_prim_rsy_vf,q_prim_rsz_vf]')
 
-
     !! Cell-average fluxes (src - source). These are directly determined from the
     !! cell-average primitive variables, q_prims_rs_vf, and not a Riemann solver.
 

From 70c679bb6176a131aa4cfbc6af2ea0ffc7272669 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tprathi3@gatech.edu>
Date: Mon, 23 Jun 2025 20:54:16 -0400
Subject: [PATCH 67/75] Added gpu nacro docs

---
 docs/documentation/gpuParallelization.md | 328 +++++++++++++++++++++++
 1 file changed, 328 insertions(+)
 create mode 100644 docs/documentation/gpuParallelization.md

diff --git a/docs/documentation/gpuParallelization.md b/docs/documentation/gpuParallelization.md
new file mode 100644
index 0000000000..34602c42ce
--- /dev/null
+++ b/docs/documentation/gpuParallelization.md
@@ -0,0 +1,328 @@
+# Parallelization via GPUs
+
+MFC compiles GPU code via OpenACC and in the future OpenMP as well.
+
+In order to swap between OpenACC and OpenMP, ACC and MP directives are implemented via FYPP macros.
+
+[OpenACC Quick start Guide](https://openacc-best-practices-guide.readthedocs.io/en/latest/01-Introduction.html)
+[OpenACC API Documentation](https://www.openacc.org/sites/default/files/inline-files/API%20Guide%202.7.pdf)
+
+------------------------------------------------------------------------------------------
+
+## Macro API Documentation
+
+Note: Ordering is not guarennteed or stable, so use key-value pairing when using macros
+
+### Data Type Meanings
+
+- Integer is a number
+
+- Boolean is a pythonic boolean - Valid options: `True` or `False`
+
+- String List is given as a comma separated list surrounding by brackets and inside quotations
+  - Ex: `'[hello, world, Fortran]'`
+
+- 2-level string list is given as a comma separated list of string lists surrounding by brackets and inside quotations
+  - Ex: `'[[hello, world], [Fortran, MFC]]'` or `[[hello]]`
+
+### Data Flow
+
+- Data on the GPU has a reference counter
+- When a variable is referred to being allocated, it means that GPU memory is allocated if it is not already present in GPU memory. If a variable is already present, the reference counter is just incremented.
+- When a variable is referred to being dellocated, it means that the reference counter is decremented. If the reference counter is zero, then the data is actually deallocated from GPU memory
+- When a variable is referred to being attached, it means that the device pointer attaches to target if it not already attached. If pointer is already attached, then the attachment counter is just incremented
+- When a variable is referred to being detached, it means that the attachment counter is decremented. If attachment counter is zero, then actually detached
+
+### Creating new/overwriting existing stubs & proxy configs
+
+<details>
+ <summary><code>GPU_PARALLEL_LOOP</code> <code>(Execute the following loop on the GPU in parallel)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_PARALLEL_LOOP(...)`
+
+#### Parameters
+
+> | name             | data type           | Default Value     | description                                                                                  |
+> |------------------|---------------------|-------------------|----------------------------------------------------------------------------------------------|
+> | `collapse`       | integer             | None              | Number of loops to combine into 1 loop                                                       |
+> | `parallelism`    | string list         | '\[gang,vector\]' | Parallelism granularity to use for this loop                                                 |
+> | `default`        | string              | 'present'         | Implicit assumptions compiler should make                                                    |
+> | `private`        | string list         | None              | Variables that are private to each iteration/thread                                          |
+> | `firstprivate`   | string list         | None              | Initialized variables that are private to each iteration/thread                              |
+> | `reduction`      | 2-level string list | None              | Variables unique to each iteration and reduced at the end                                    |
+> | `reductionOp`    | string list         | None              | Operator that each list of reduction will reduce with                                        |
+> | `copy`           | string list         | None              | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
+> | `copyin`         | string list         | None              | Allocates and copies a variable to GPU on entrance and then deallocated on exit              |
+> | `copyinReadOnly` | string list         | None              | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
+> | `copyout`        | string list         | None              | Allocates a variable on GPU on entrance and then deallocates and copies to CPU on exit       |
+> | `create`         | string list         | None              | Allocates a variable on GPU on entrance and then deallocates on exit                         |
+> | `no_create`      | string list         | None              | Use data in CPU memory unless data is already in GPU memory                                  |
+> | `present`        | string list         | None              | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
+> | `deviceptr`      | string list         | None              | Pointer variables that are already allocated on GPU memory                                   |
+> | `attach`         | string list         | None              | Attaches device pointer to device targets on entrance, then detach on exit                   |
+> | `extraAccArgs`   | string              | None              | String of any extra arguments added to the OpenACC directive                                 |
+
+#### Parameter Restrictions
+
+| name        | Restricted range                                  |
+|-------------|---------------------------------------------------|
+| collapse    | Must be greater than 1                            |
+| parallelism | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+| default     | 'present' or 'none'                               |
+
+#### Additional information
+
+- default present means that the any non-scalar data in assumed to be present on the GPU
+- default none means that the compiler should not implicitly determine the data attributes for any variable
+- reduction and reductionOp must match in length
+- With `reduction='[[sum1, sum2], [largest]]'` and `reductionOp='[+, max]'`, `sum1` and `sum2` will be the sum of sum1/sum2 in each loop iteration, and `largest` will the maximum value of `largest` all the loop iterations
+- A reduction implies a copy, so it does not need to be added for both
+
+#### Example
+
+> ```python
+>  $:GPU_PARALLEL_LOOP(collapse=3, private='[tmp, r]', reduction='[[vol, avg], [max_val]]', reductionOp='[+, MAX]')
+>  $:GPU_PARALLEL_LOOP(collapse=2, private='[sum_holder]', copyin='[starting_sum]', copyout='[eigenval,C]')
+> ```
+
+</details>
+
+------------------------------------------------------------------------------------------
+
+### Listing existing stubs & proxy configs as YAML string
+
+<details>
+ <summary><code>GPU_ROUTINE</code> <code>(Compile a procedure for the GPU)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_ROUTINE(...)`
+
+
+#### Parameters
+
+| name            | data type   | Default Value | description                                                  |
+|-----------------|-------------|---------------|--------------------------------------------------------------|
+| `function_name` | string      | None          | Name of subroutine/function                                  |
+| `parallelism`   | string list | None          | Parallelism granularity to use for this routine              |
+| `nohost`        | boolean     | False         | Do not compile procedure code for CPU                        |
+| `cray_inline`   | boolean     | False         | Inline procedure on cray compiler                            |
+| `extraAccArgs`  | string      | None          | String of any extra arguments added to the OpenACC directive |
+
+#### Parameter Restrictions
+
+| name        | Restricted range                                  |
+|-------------|---------------------------------------------------|
+| parallelism | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+
+#### Additional information
+
+- Function name only needs to be given when cray_inline is True
+- Future capability is to parse function header for function name
+- Routine parallelism is most commonly `'[seq]'`
+
+#### Example
+
+> ```python
+>  $:GPU_ROUTINE(parallelism='[seq]')
+>  $:GPU_ROUTINE(function_name='s_matmult', parallelism='[seq]', cray_inline=True)
+> ```
+
+</details>
+
+<details>
+ <summary><code>GPU_DECLARE</code> <code>(Allocate module variables on GPU or for implicit data region )</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_DECLARE(...)`
+
+#### Parameters
+
+| name             | data type   | Default Value | description                                                                                  |
+|------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
+| `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
+| `copyin`         | string list | None          | Allocates and copies a variable to GPU on entrance and then deallocated on exit              |
+| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
+| `copyout`        | string list | None          | Allocates a variable on GPU on entrance and then deallocates and copies to CPU on exit       |
+| `create`         | string list | None          | Allocates a variable on GPU on entrance and then deallocates on exit                         |
+| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
+| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
+| `link`           | string list | None          | Declare global link, and only allocate when variable used in data clause.                    |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
+
+#### Additional information
+
+- An implicit data region is created at the start of each procedure
+and ends after the last executable statement in that procedure.
+- Use only create, copyin, device_resident or link clauses for module variables
+- GPU_DECLARE exit is the end of the implicit data region
+- Link is useful for large global static data objects
+
+#### Example
+
+> ```python
+>  $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p]')
+>  $:GPU_DECLARE(create='[x_cb,y_cb,z_cb]', copyin='[x_cc,y_cc,z_cc]', link='[dx,dy,dz,dt,m,n,p]')
+> ```
+
+</details>
+
+<details>
+  <summary><code>GPU_LOOP</code> <code>(Execute loop on GPU)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_LOOP(...)`
+
+#### Parameters
+
+| name              | data type           | Default Value | description                                                                                      |
+|-------------------|---------------------|---------------|--------------------------------------------------------------------------------------------------|
+| `collapse`        | integer             | None          | Number of loops to combine into 1 loop                                                           |
+| `parallelism`     | string list         | None          | Parallelism granularity to use for this loop                                                     |
+| `data_dependency` | string              | None          | 'independent'-> assert loop iterations are independent, 'auto->let compiler analyze dependencies |
+| `private`         | string list         | None          | Variables that are private to each iteration/thread                                              |
+| `reduction`       | 2-level string list | None          | Variables unique to each iteration and reduced at the end                                        |
+| `reductionOp`     | string list         | None          | Operator that each list of reduction will reduce with                                            |
+| `extraAccArgs`    | string              | None          | String of any extra arguments added to the OpenACC directive                                     |
+
+#### Parameter Restrictions
+
+| name            | Restricted range                                  |
+|-----------------|---------------------------------------------------|
+| collapse        | Must be greater than 1                            |
+| parallelism     | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+| data_dependency | 'auto' or 'independent'                           |
+
+#### Additional information
+
+- Loop parallelism is most commonly `'[seq]'`
+- reduction and reductionOp must match in length
+- With `reduction='[[sum1, sum2], [largest]]'` and `reductionOp='[+, max]'`, `sum1` and `sum2` will be the sum of sum1/sum2 in each loop iteration, and `largest` will the maximum value of `largest` all the loop iterations
+
+#### Example
+
+> ```python
+>  $:GPU_PARALLEL_LOOP(parallelism='[seq]')
+>  $:GPU_PARALLEL_LOOP(collapse=3, parallelism='[seq]',private='[tmp, r]')
+> ```
+
+</details>
+
+<details>
+ <summary><code>GPU_DATA</code> <code>(Allocate module variables on GPU or for implicit data region )</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP call directive using `#:call`
+> ```C
+>
+> #:call GPU_DATA(...)
+>    {code}
+> #:endcall GPU_DATA 
+>```
+> 
+
+#### Parameters
+
+| name             | data type   | Default Value | description                                                                                  |
+|------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
+| `code`           | code        | Required      | Region of code where defined data is accessible                                              |
+| `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
+| `copyin`         | string list | None          | Allocates and copies a variable to GPU on entrance and then deallocated on exit              |
+| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
+| `copyout`        | string list | None          | Allocates a variable on GPU on entrance and then deallocates and copies to CPU on exit       |
+| `create`         | string list | None          | Allocates a variable on GPU on entrance and then deallocates on exit                         |
+| `no_create`      | string list | None          | Use data in CPU memory unless data is already in GPU memory                                  |
+| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
+| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
+| `attach`         | string list | None          | Attaches device pointer to device targets on entrance, then detach on exit                   |
+| `default`        | string      | None          | Implicit assumptions compiler should make                                                    |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
+
+#### Parameter Restrictions
+
+| name            | Restricted range                                  |
+|-----------------|---------------------------------------------------|
+| code        | Do not assign it manually with key-value pairing                            |
+
+#### Additional information
+
+#### Example
+
+> ```C
+>  #:call GPU_DATA(copy='[pixel_arr]', copyin='[starting_pixels, inital_index]',attach='[p_real, p_cmplx, p_fltr_cmplx]')
+>       {code}
+>       ...
+>  #:endcall
+>  #:call GPU_DATA(create='[pixel_arr]', copyin='[inital_index]')
+>       {code}
+>       ...
+>   #:endcall
+> ```
+
+</details>
+
+<details>
+ <summary><code>GPU_DATA</code> <code>(Allocate module variables on GPU or for implicit data region )</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP call directive using `#:call`
+> ```C
+>
+> #:call GPU_DATA(...)
+>    {code}
+> #:endcall GPU_DATA 
+>```
+> 
+
+#### Parameters
+
+| name             | data type   | Default Value | description                                                                                  |
+|------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
+| `code`           | code        | Required      | Region of code where defined data is accessible                                              |
+| `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
+| `copyin`         | string list | None          | Allocates and copies a variable to GPU on entrance and then deallocated on exit              |
+| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
+| `copyout`        | string list | None          | Allocates a variable on GPU on entrance and then deallocates and copies to CPU on exit       |
+| `create`         | string list | None          | Allocates a variable on GPU on entrance and then deallocates on exit                         |
+| `no_create`      | string list | None          | Use data in CPU memory unless data is already in GPU memory                                  |
+| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
+| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
+| `attach`         | string list | None          | Attaches device pointer to device targets on entrance, then detach on exit                   |
+| `default`        | string      | None          | Implicit assumptions compiler should make                                                    |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
+
+#### Parameter Restrictions
+
+| name            | Restricted range                                  |
+|-----------------|---------------------------------------------------|
+| code        | Do not assign it manually with key-value pairing                            |
+
+#### Additional information
+
+#### Example
+
+> ```C
+>  #:call GPU_DATA(copy='[pixel_arr]', copyin='[starting_pixels, inital_index]',attach='[p_real, p_cmplx, p_fltr_cmplx]')
+>       {code}
+>       ...
+>  #:endcall
+>  #:call GPU_DATA(create='[pixel_arr]', copyin='[inital_index]')
+>       {code}
+>       ...
+>   #:endcall
+> ```
+
+</details>
+
+
+------------------------------------------------------------------------------------------

From 58994926b106d8462fadf17873a8c249111189c3 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tprathi3@gatech.edu>
Date: Mon, 23 Jun 2025 21:37:07 -0400
Subject: [PATCH 68/75] more gpu macro docs

---
 docs/documentation/gpuParallelization.md | 195 ++++++++++++++++++-----
 1 file changed, 158 insertions(+), 37 deletions(-)

diff --git a/docs/documentation/gpuParallelization.md b/docs/documentation/gpuParallelization.md
index 34602c42ce..9f4953bcc1 100644
--- a/docs/documentation/gpuParallelization.md
+++ b/docs/documentation/gpuParallelization.md
@@ -54,11 +54,11 @@ Uses FYPP eval directive using `$:`
 > | `firstprivate`   | string list         | None              | Initialized variables that are private to each iteration/thread                              |
 > | `reduction`      | 2-level string list | None              | Variables unique to each iteration and reduced at the end                                    |
 > | `reductionOp`    | string list         | None              | Operator that each list of reduction will reduce with                                        |
-> | `copy`           | string list         | None              | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
-> | `copyin`         | string list         | None              | Allocates and copies a variable to GPU on entrance and then deallocated on exit              |
-> | `copyinReadOnly` | string list         | None              | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
-> | `copyout`        | string list         | None              | Allocates a variable on GPU on entrance and then deallocates and copies to CPU on exit       |
-> | `create`         | string list         | None              | Allocates a variable on GPU on entrance and then deallocates on exit                         |
+> | `copy`           | string list         | None              | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit |
+> | `copyin`         | string list         | None              | Allocates and copies data to GPU on entrance and then deallocated on exit              |
+> | `copyinReadOnly` | string list         | None              | Allocates and copies readonly data to GPU and then deallocated on exit                 |
+> | `copyout`        | string list         | None              | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit       |
+> | `create`         | string list         | None              | Allocates data on GPU on entrance and then deallocates on exit                         |
 > | `no_create`      | string list         | None              | Use data in CPU memory unless data is already in GPU memory                                  |
 > | `present`        | string list         | None              | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
 > | `deviceptr`      | string list         | None              | Pointer variables that are already allocated on GPU memory                                   |
@@ -146,11 +146,11 @@ Uses FYPP eval directive using `$:`
 
 | name             | data type   | Default Value | description                                                                                  |
 |------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
-| `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
-| `copyin`         | string list | None          | Allocates and copies a variable to GPU on entrance and then deallocated on exit              |
+| `copy`           | string list | None          | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit |
+| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit              |
 | `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
-| `copyout`        | string list | None          | Allocates a variable on GPU on entrance and then deallocates and copies to CPU on exit       |
-| `create`         | string list | None          | Allocates a variable on GPU on entrance and then deallocates on exit                         |
+| `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit       |
+| `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                         |
 | `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
 | `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
 | `link`           | string list | None          | Declare global link, and only allocate when variable used in data clause.                    |
@@ -217,52 +217,43 @@ Uses FYPP eval directive using `$:`
 </details>
 
 <details>
- <summary><code>GPU_DATA</code> <code>(Allocate module variables on GPU or for implicit data region )</code></summary>
+ <summary><code>GPU_HOST_DATA</code> <code>(Make GPU memory address avaliable on CPU)</code></summary>
 
 #### Macro Invocation
 
 Uses FYPP call directive using `#:call`
 > ```C
 >
-> #:call GPU_DATA(...)
+> #:call GPU_HOST_DATA(...)
 >    {code}
-> #:endcall GPU_DATA 
+> #:endcall GPU_HOST_DATA 
 >```
 > 
 
 #### Parameters
 
-| name             | data type   | Default Value | description                                                                                  |
-|------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
-| `code`           | code        | Required      | Region of code where defined data is accessible                                              |
-| `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
-| `copyin`         | string list | None          | Allocates and copies a variable to GPU on entrance and then deallocated on exit              |
-| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
-| `copyout`        | string list | None          | Allocates a variable on GPU on entrance and then deallocates and copies to CPU on exit       |
-| `create`         | string list | None          | Allocates a variable on GPU on entrance and then deallocates on exit                         |
-| `no_create`      | string list | None          | Use data in CPU memory unless data is already in GPU memory                                  |
-| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
-| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
-| `attach`         | string list | None          | Attaches device pointer to device targets on entrance, then detach on exit                   |
-| `default`        | string      | None          | Implicit assumptions compiler should make                                                    |
-| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
+| name           | data type   | Default Value | description                                                      |
+|----------------|-------------|---------------|------------------------------------------------------------------|
+| `code`         | code        | Required      | Region of code where GPU memory addresses is accessible          |
+| `use_device`   | string list | None          | Use GPU memory address of variable instead of CPU memory address |
+| `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive     |
 
 #### Parameter Restrictions
 
-| name            | Restricted range                                  |
-|-----------------|---------------------------------------------------|
-| code        | Do not assign it manually with key-value pairing                            |
+| name | Restricted range                                 |
+|------|--------------------------------------------------|
+| code | Do not assign it manually with key-value pairing |
 
 #### Additional information
 
 #### Example
 
 > ```C
->  #:call GPU_DATA(copy='[pixel_arr]', copyin='[starting_pixels, inital_index]',attach='[p_real, p_cmplx, p_fltr_cmplx]')
+>  #:call GPU_HOST_DATA(use_device='[addr1, addr2]')
 >       {code}
 >       ...
->  #:endcall
->  #:call GPU_DATA(create='[pixel_arr]', copyin='[inital_index]')
+>  #:endcall GPU_HOST_DATA
+>  #:call GPU_HOST_DATA(use_device='[display_arr]')
 >       {code}
 >       ...
 >   #:endcall
@@ -271,7 +262,7 @@ Uses FYPP call directive using `#:call`
 </details>
 
 <details>
- <summary><code>GPU_DATA</code> <code>(Allocate module variables on GPU or for implicit data region )</code></summary>
+ <summary><code>GPU_DATA</code> <code>(Make data accessible on GPU in specified region)</code></summary>
 
 #### Macro Invocation
 
@@ -290,10 +281,10 @@ Uses FYPP call directive using `#:call`
 |------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
 | `code`           | code        | Required      | Region of code where defined data is accessible                                              |
 | `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
-| `copyin`         | string list | None          | Allocates and copies a variable to GPU on entrance and then deallocated on exit              |
+| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit              |
 | `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
-| `copyout`        | string list | None          | Allocates a variable on GPU on entrance and then deallocates and copies to CPU on exit       |
-| `create`         | string list | None          | Allocates a variable on GPU on entrance and then deallocates on exit                         |
+| `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit       |
+| `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                         |
 | `no_create`      | string list | None          | Use data in CPU memory unless data is already in GPU memory                                  |
 | `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
 | `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
@@ -315,7 +306,7 @@ Uses FYPP call directive using `#:call`
 >  #:call GPU_DATA(copy='[pixel_arr]', copyin='[starting_pixels, inital_index]',attach='[p_real, p_cmplx, p_fltr_cmplx]')
 >       {code}
 >       ...
->  #:endcall
+>  #:endcall GPU_DATA
 >  #:call GPU_DATA(create='[pixel_arr]', copyin='[inital_index]')
 >       {code}
 >       ...
@@ -324,5 +315,135 @@ Uses FYPP call directive using `#:call`
 
 </details>
 
+<details>
+ <summary><code>GPU_ENTER_DATA</code> <code>(Allocate/move data to GPU until matching GPU_EXIT_DATA or program termination)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_ENTER_DATA(...)`
+
+#### Parameters
+
+| name             | data type   | Default Value | description                                                  |
+|------------------|-------------|---------------|--------------------------------------------------------------|
+| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance           |
+| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU on entrance  |
+| `create`         | string list | None          | Allocates data on GPU on entrance                      |
+| `attach`         | string list | None          | Attaches device pointer to device targets on entrance        |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
+
+#### Additional information
+
+#### Example
+
+> ```python
+>  $:GPU_ENTER_DATA(copyin='[pixels_arr]', copyinReadOnly='[starting_pixels, inital_index]')
+>  $:GPU_ENTER_DATA(create='[bc_buffers(1:num_dims, -1:1)]', copyin='[inital_index]')
+> ```
+
+</details>
+
+<details>
+ <summary><code>GPU_EXIT_DATA</code> <code>(Deallocate/move data from GPU created by GPU_ENTER_DATA)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_EXIT_DATA(...)`
+
+#### Parameters
+
+| name             | data type   | Default Value | description                                                  |
+|------------------|-------------|---------------|--------------------------------------------------------------|
+| `copyout`         | string list | None          | Deallocates and copies data from GPU to CPU on exit           |
+| `delete`         | string list | None          | Deallocates data on GPU on exit                      |
+| `detach`         | string list | None          | Detach device pointer from device targets on exit        |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
+
+#### Additional information
+
+#### Example
+
+> ```python
+>  $:GPU_EXIT_DATA(copyout='[pixels_arr]', delete='[starting_pixels, inital_index]')
+>  $:GPU_EXIT_DATA(delete='[bc_buffers(1:num_dims, -1:1)]', copyout='[inital_index]')
+> ```
+
+</details>
+
+<details>
+ <summary><code>GPU_CACHE</code> <code>(Data to be cache in software-managed cache)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_CACHE(...)`
+
+#### Parameters
+
+| name             | data type   | Default Value | description                                                  |
+|------------------|-------------|---------------|--------------------------------------------------------------|
+| `cache`         | string list | Required          | Data that should to stored in cache           |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
+
+#### Additional information
+
+#### Example
+
+> ```python
+>  $:GPU_CACHE(cache='[pixels_arr]')
+> ```
+
+</details>
+
+GPU_ATOMIC(atomic, extraAccArgs=None)
+
+<details>
+ <summary><code>GPU_ATOMIC</code> <code>(Do an atomic operation on the GPU)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_ATOMIC(...)`
+
+#### Parameters
+
+| name             | data type   | Default Value | description                                                  |
+|------------------|-------------|---------------|--------------------------------------------------------------|
+| `atomic`         | string | Required          | Which atomic operation is performed           |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
+
+#### Parameter Restrictions
+
+| name            | Restricted range                                  |
+|-----------------|---------------------------------------------------|
+| atomic        | 'read', 'write', 'update', or 'capture'                            |
+
+#### Additional information
+
+- read atomic is reading in a value
+    - Ex: `v=x`
+- write atomic is writing a value to a variable
+    - Ex:`x=square(tmp)`
+- update atomic is updating a variable in-place
+    - Ex:`x= x .and. 1`
+- Capture is a pair of read/write/update operations with one dependent on the other
+    - Ex: ```
+        `x=x .and. 1`
+        `v=x`
+    ```
+    
 
+#### Example
+
+> ```python
+>  $:GPU_CACHE(cache='[update]')
+>  x = square(x)
+>  $:GPU_CACHE(cache='[capture]')
+>  x = square(x)
+>  v = x
+> ```
+
+</details>
 ------------------------------------------------------------------------------------------

From df1a27b1c9100a19f555979f80102b8bacaf4244 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tprathi3@gatech.edu>
Date: Tue, 24 Jun 2025 13:02:01 -0400
Subject: [PATCH 69/75] Finished up gpu docs

---
 docs/documentation/gpuParallelization.md | 125 ++++++++++++++++-------
 1 file changed, 87 insertions(+), 38 deletions(-)

diff --git a/docs/documentation/gpuParallelization.md b/docs/documentation/gpuParallelization.md
index 9f4953bcc1..2b60d17258 100644
--- a/docs/documentation/gpuParallelization.md
+++ b/docs/documentation/gpuParallelization.md
@@ -28,10 +28,10 @@ Note: Ordering is not guarennteed or stable, so use key-value pairing when using
 ### Data Flow
 
 - Data on the GPU has a reference counter
-- When a variable is referred to being allocated, it means that GPU memory is allocated if it is not already present in GPU memory. If a variable is already present, the reference counter is just incremented.
-- When a variable is referred to being dellocated, it means that the reference counter is decremented. If the reference counter is zero, then the data is actually deallocated from GPU memory
-- When a variable is referred to being attached, it means that the device pointer attaches to target if it not already attached. If pointer is already attached, then the attachment counter is just incremented
-- When a variable is referred to being detached, it means that the attachment counter is decremented. If attachment counter is zero, then actually detached
+- When data is referred to being allocated, it means that GPU memory is allocated if it is not already present in GPU memory. If a variable is already present, the reference counter is just incremented.
+- When data is referred to being dellocated, it means that the reference counter is decremented. If the reference counter is zero, then the data is actually deallocated from GPU memory
+- When data is referred to being attached, it means that the device pointer attaches to target if it not already attached. If pointer is already attached, then the attachment counter is just incremented
+- When data is referred to being detached, it means that the attachment counter is decremented. If attachment counter is zero, then actually detached
 
 ### Creating new/overwriting existing stubs & proxy configs
 
@@ -45,25 +45,25 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-> | name             | data type           | Default Value     | description                                                                                  |
-> |------------------|---------------------|-------------------|----------------------------------------------------------------------------------------------|
-> | `collapse`       | integer             | None              | Number of loops to combine into 1 loop                                                       |
-> | `parallelism`    | string list         | '\[gang,vector\]' | Parallelism granularity to use for this loop                                                 |
-> | `default`        | string              | 'present'         | Implicit assumptions compiler should make                                                    |
-> | `private`        | string list         | None              | Variables that are private to each iteration/thread                                          |
-> | `firstprivate`   | string list         | None              | Initialized variables that are private to each iteration/thread                              |
-> | `reduction`      | 2-level string list | None              | Variables unique to each iteration and reduced at the end                                    |
-> | `reductionOp`    | string list         | None              | Operator that each list of reduction will reduce with                                        |
-> | `copy`           | string list         | None              | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit |
-> | `copyin`         | string list         | None              | Allocates and copies data to GPU on entrance and then deallocated on exit              |
-> | `copyinReadOnly` | string list         | None              | Allocates and copies readonly data to GPU and then deallocated on exit                 |
-> | `copyout`        | string list         | None              | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit       |
-> | `create`         | string list         | None              | Allocates data on GPU on entrance and then deallocates on exit                         |
-> | `no_create`      | string list         | None              | Use data in CPU memory unless data is already in GPU memory                                  |
-> | `present`        | string list         | None              | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
-> | `deviceptr`      | string list         | None              | Pointer variables that are already allocated on GPU memory                                   |
-> | `attach`         | string list         | None              | Attaches device pointer to device targets on entrance, then detach on exit                   |
-> | `extraAccArgs`   | string              | None              | String of any extra arguments added to the OpenACC directive                                 |
+> | name             | data type           | Default Value     | description                                                                               |
+> |------------------|---------------------|-------------------|-------------------------------------------------------------------------------------------|
+> | `collapse`       | integer             | None              | Number of loops to combine into 1 loop                                                    |
+> | `parallelism`    | string list         | '\[gang,vector\]' | Parallelism granularity to use for this loop                                              |
+> | `default`        | string              | 'present'         | Implicit assumptions compiler should make                                                 |
+> | `private`        | string list         | None              | Variables that are private to each iteration/thread                                       |
+> | `firstprivate`   | string list         | None              | Initialized variables that are private to each iteration/thread                           |
+> | `reduction`      | 2-level string list | None              | Variables unique to each iteration and reduced at the end                                 |
+> | `reductionOp`    | string list         | None              | Operator that each list of reduction will reduce with                                     |
+> | `copy`           | string list         | None              | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit  |
+> | `copyin`         | string list         | None              | Allocates and copies data to GPU on entrance and then deallocated on exit                 |
+> | `copyinReadOnly` | string list         | None              | Allocates and copies readonly data to GPU and then deallocated on exit                    |
+> | `copyout`        | string list         | None              | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit          |
+> | `create`         | string list         | None              | Allocates data on GPU on entrance and then deallocates on exit                            |
+> | `no_create`      | string list         | None              | Use data in CPU memory unless data is already in GPU memory                               |
+> | `present`        | string list         | None              | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit |
+> | `deviceptr`      | string list         | None              | Pointer variables that are already allocated on GPU memory                                |
+> | `attach`         | string list         | None              | Attaches device pointer to device targets on entrance, then detach on exit                |
+> | `extraAccArgs`   | string              | None              | String of any extra arguments added to the OpenACC directive                              |
 
 #### Parameter Restrictions
 
@@ -217,7 +217,7 @@ Uses FYPP eval directive using `$:`
 </details>
 
 <details>
- <summary><code>GPU_HOST_DATA</code> <code>(Make GPU memory address avaliable on CPU)</code></summary>
+ <summary><code>GPU_HOST_DATA</code> <code>(Make GPU memory address available on CPU)</code></summary>
 
 #### Macro Invocation
 
@@ -303,11 +303,11 @@ Uses FYPP call directive using `#:call`
 #### Example
 
 > ```C
->  #:call GPU_DATA(copy='[pixel_arr]', copyin='[starting_pixels, inital_index]',attach='[p_real, p_cmplx, p_fltr_cmplx]')
+>  #:call GPU_DATA(copy='[pixel_arr]', copyin='[starting_pixels, initial_index]',attach='[p_real, p_cmplx, p_fltr_cmplx]')
 >       {code}
 >       ...
 >  #:endcall GPU_DATA
->  #:call GPU_DATA(create='[pixel_arr]', copyin='[inital_index]')
+>  #:call GPU_DATA(create='[pixel_arr]', copyin='[initial_index]')
 >       {code}
 >       ...
 >   #:endcall
@@ -338,8 +338,8 @@ Uses FYPP eval directive using `$:`
 #### Example
 
 > ```python
->  $:GPU_ENTER_DATA(copyin='[pixels_arr]', copyinReadOnly='[starting_pixels, inital_index]')
->  $:GPU_ENTER_DATA(create='[bc_buffers(1:num_dims, -1:1)]', copyin='[inital_index]')
+>  $:GPU_ENTER_DATA(copyin='[pixels_arr]', copyinReadOnly='[starting_pixels, initial_index]')
+>  $:GPU_ENTER_DATA(create='[bc_buffers(1:num_dims, -1:1)]', copyin='[initial_index]')
 > ```
 
 </details>
@@ -366,8 +366,8 @@ Uses FYPP eval directive using `$:`
 #### Example
 
 > ```python
->  $:GPU_EXIT_DATA(copyout='[pixels_arr]', delete='[starting_pixels, inital_index]')
->  $:GPU_EXIT_DATA(delete='[bc_buffers(1:num_dims, -1:1)]', copyout='[inital_index]')
+>  $:GPU_EXIT_DATA(copyout='[pixels_arr]', delete='[starting_pixels, initial_index]')
+>  $:GPU_EXIT_DATA(delete='[bc_buffers(1:num_dims, -1:1)]', copyout='[initial_index]')
 > ```
 
 </details>
@@ -397,8 +397,6 @@ Uses FYPP eval directive using `$:`
 
 </details>
 
-GPU_ATOMIC(atomic, extraAccArgs=None)
-
 <details>
  <summary><code>GPU_ATOMIC</code> <code>(Do an atomic operation on the GPU)</code></summary>
 
@@ -409,10 +407,10 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name             | data type   | Default Value | description                                                  |
-|------------------|-------------|---------------|--------------------------------------------------------------|
-| `atomic`         | string | Required          | Which atomic operation is performed           |
-| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
+| name           | data type | Default Value | description                                                  |
+|----------------|-----------|---------------|--------------------------------------------------------------|
+| `atomic`       | string    | Required      | Which atomic operation is performed                          |
+| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Parameter Restrictions
 
@@ -438,12 +436,63 @@ Uses FYPP eval directive using `$:`
 #### Example
 
 > ```python
->  $:GPU_CACHE(cache='[update]')
+>  $:GPU_ATOMIC(atomic='update')
 >  x = square(x)
->  $:GPU_CACHE(cache='[capture]')
+>  $:GPU_ATOMIC(atomic='capture')
 >  x = square(x)
 >  v = x
 > ```
 
+</details>
+
+<details>
+ <summary><code>GPU_UPDATE</code> <code>(Updates data from CPU to GPU or GPU to CPU)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_UPDATE(...)`
+
+#### Parameters
+
+| name           | data type | Default Value | description                                                  |
+|----------------|-----------|---------------|--------------------------------------------------------------|
+| `host`       | string list   | None      | Updates data from GPU to CPU                         |
+| `device`       | string list   | None      | Updates data from CPU to GPU     |
+| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
+
+#### Additional information
+
+#### Example
+
+> ```python
+>  $:GPU_UPDATE(host='[arr1, arr2]')
+>  $:GPU_UPDATE(host='[updated_gpu_val]', device='[updated_cpu_val]')
+> ```
+
+</details>
+
+<details>
+ <summary><code>GPU_WAIT</code> <code>(Makes CPU wait for async GPU activities)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_WAIT(...)`
+
+#### Parameters
+
+| name           | data type | Default Value | description                                                  |
+|----------------|-----------|---------------|--------------------------------------------------------------|
+| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
+
+#### Additional information
+
+#### Example
+
+> ```python
+>  $:GPU_WAIT()
+> ```
+
 </details>
 ------------------------------------------------------------------------------------------

From 9036bfdf25a21fc323298a98afe8cb7bd4771e62 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tprathi3@gatech.edu>
Date: Tue, 24 Jun 2025 13:20:31 -0400
Subject: [PATCH 70/75] Reorganized macros into logical groups

---
 docs/documentation/gpuParallelization.md | 324 ++++++++++++-----------
 1 file changed, 167 insertions(+), 157 deletions(-)

diff --git a/docs/documentation/gpuParallelization.md b/docs/documentation/gpuParallelization.md
index 2b60d17258..75f657c855 100644
--- a/docs/documentation/gpuParallelization.md
+++ b/docs/documentation/gpuParallelization.md
@@ -33,7 +33,9 @@ Note: Ordering is not guarennteed or stable, so use key-value pairing when using
 - When data is referred to being attached, it means that the device pointer attaches to target if it not already attached. If pointer is already attached, then the attachment counter is just incremented
 - When data is referred to being detached, it means that the attachment counter is decremented. If attachment counter is zero, then actually detached
 
-### Creating new/overwriting existing stubs & proxy configs
+------------------------------------------------------------------------------------------
+
+### Computation Macros
 
 <details>
  <summary><code>GPU_PARALLEL_LOOP</code> <code>(Execute the following loop on the GPU in parallel)</code></summary>
@@ -69,9 +71,9 @@ Uses FYPP eval directive using `$:`
 
 | name        | Restricted range                                  |
 |-------------|---------------------------------------------------|
-| collapse    | Must be greater than 1                            |
-| parallelism | Valid elements: 'gang', 'worker', 'vector', 'seq' |
-| default     | 'present' or 'none'                               |
+| `collapse`    | Must be greater than 1                            |
+| `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+| `default`     | 'present' or 'none'                               |
 
 #### Additional information
 
@@ -90,89 +92,6 @@ Uses FYPP eval directive using `$:`
 
 </details>
 
-------------------------------------------------------------------------------------------
-
-### Listing existing stubs & proxy configs as YAML string
-
-<details>
- <summary><code>GPU_ROUTINE</code> <code>(Compile a procedure for the GPU)</code></summary>
-
-#### Macro Invocation
-
-Uses FYPP eval directive using `$:`
-> `$:GPU_ROUTINE(...)`
-
-
-#### Parameters
-
-| name            | data type   | Default Value | description                                                  |
-|-----------------|-------------|---------------|--------------------------------------------------------------|
-| `function_name` | string      | None          | Name of subroutine/function                                  |
-| `parallelism`   | string list | None          | Parallelism granularity to use for this routine              |
-| `nohost`        | boolean     | False         | Do not compile procedure code for CPU                        |
-| `cray_inline`   | boolean     | False         | Inline procedure on cray compiler                            |
-| `extraAccArgs`  | string      | None          | String of any extra arguments added to the OpenACC directive |
-
-#### Parameter Restrictions
-
-| name        | Restricted range                                  |
-|-------------|---------------------------------------------------|
-| parallelism | Valid elements: 'gang', 'worker', 'vector', 'seq' |
-
-#### Additional information
-
-- Function name only needs to be given when cray_inline is True
-- Future capability is to parse function header for function name
-- Routine parallelism is most commonly `'[seq]'`
-
-#### Example
-
-> ```python
->  $:GPU_ROUTINE(parallelism='[seq]')
->  $:GPU_ROUTINE(function_name='s_matmult', parallelism='[seq]', cray_inline=True)
-> ```
-
-</details>
-
-<details>
- <summary><code>GPU_DECLARE</code> <code>(Allocate module variables on GPU or for implicit data region )</code></summary>
-
-#### Macro Invocation
-
-Uses FYPP eval directive using `$:`
-> `$:GPU_DECLARE(...)`
-
-#### Parameters
-
-| name             | data type   | Default Value | description                                                                                  |
-|------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
-| `copy`           | string list | None          | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit |
-| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit              |
-| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
-| `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit       |
-| `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                         |
-| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
-| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
-| `link`           | string list | None          | Declare global link, and only allocate when variable used in data clause.                    |
-| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
-
-#### Additional information
-
-- An implicit data region is created at the start of each procedure
-and ends after the last executable statement in that procedure.
-- Use only create, copyin, device_resident or link clauses for module variables
-- GPU_DECLARE exit is the end of the implicit data region
-- Link is useful for large global static data objects
-
-#### Example
-
-> ```python
->  $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p]')
->  $:GPU_DECLARE(create='[x_cb,y_cb,z_cb]', copyin='[x_cc,y_cc,z_cc]', link='[dx,dy,dz,dt,m,n,p]')
-> ```
-
-</details>
-
 <details>
   <summary><code>GPU_LOOP</code> <code>(Execute loop on GPU)</code></summary>
 
@@ -197,9 +116,9 @@ Uses FYPP eval directive using `$:`
 
 | name            | Restricted range                                  |
 |-----------------|---------------------------------------------------|
-| collapse        | Must be greater than 1                            |
-| parallelism     | Valid elements: 'gang', 'worker', 'vector', 'seq' |
-| data_dependency | 'auto' or 'independent'                           |
+| `collapse`        | Must be greater than 1                            |
+| `parallelism`     | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+| `data_dependency` | 'auto' or 'independent'                           |
 
 #### Additional information
 
@@ -216,50 +135,9 @@ Uses FYPP eval directive using `$:`
 
 </details>
 
-<details>
- <summary><code>GPU_HOST_DATA</code> <code>(Make GPU memory address available on CPU)</code></summary>
-
-#### Macro Invocation
-
-Uses FYPP call directive using `#:call`
-> ```C
->
-> #:call GPU_HOST_DATA(...)
->    {code}
-> #:endcall GPU_HOST_DATA 
->```
-> 
-
-#### Parameters
-
-| name           | data type   | Default Value | description                                                      |
-|----------------|-------------|---------------|------------------------------------------------------------------|
-| `code`         | code        | Required      | Region of code where GPU memory addresses is accessible          |
-| `use_device`   | string list | None          | Use GPU memory address of variable instead of CPU memory address |
-| `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive     |
-
-#### Parameter Restrictions
-
-| name | Restricted range                                 |
-|------|--------------------------------------------------|
-| code | Do not assign it manually with key-value pairing |
-
-#### Additional information
-
-#### Example
-
-> ```C
->  #:call GPU_HOST_DATA(use_device='[addr1, addr2]')
->       {code}
->       ...
->  #:endcall GPU_HOST_DATA
->  #:call GPU_HOST_DATA(use_device='[display_arr]')
->       {code}
->       ...
->   #:endcall
-> ```
+------------------------------------------------------------------------------------------
 
-</details>
+### Data Control Macros
 
 <details>
  <summary><code>GPU_DATA</code> <code>(Make data accessible on GPU in specified region)</code></summary>
@@ -296,7 +174,7 @@ Uses FYPP call directive using `#:call`
 
 | name            | Restricted range                                  |
 |-----------------|---------------------------------------------------|
-| code        | Do not assign it manually with key-value pairing                            |
+| `code`        | Do not assign it manually with key-value pairing                            |
 
 #### Additional information
 
@@ -373,26 +251,140 @@ Uses FYPP eval directive using `$:`
 </details>
 
 <details>
- <summary><code>GPU_CACHE</code> <code>(Data to be cache in software-managed cache)</code></summary>
+ <summary><code>GPU_DECLARE</code> <code>(Allocate module variables on GPU or for implicit data region )</code></summary>
 
 #### Macro Invocation
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_CACHE(...)`
+> `$:GPU_DECLARE(...)`
 
 #### Parameters
 
-| name             | data type   | Default Value | description                                                  |
-|------------------|-------------|---------------|--------------------------------------------------------------|
-| `cache`         | string list | Required          | Data that should to stored in cache           |
-| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
+| name             | data type   | Default Value | description                                                                                  |
+|------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
+| `copy`           | string list | None          | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit |
+| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit              |
+| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
+| `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit       |
+| `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                         |
+| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
+| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
+| `link`           | string list | None          | Declare global link, and only allocate when variable used in data clause.                    |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
+
+#### Additional information
+
+- An implicit data region is created at the start of each procedure
+and ends after the last executable statement in that procedure.
+- Use only create, copyin, device_resident or link clauses for module variables
+- GPU_DECLARE exit is the end of the implicit data region
+- Link is useful for large global static data objects
+
+#### Example
+
+> ```python
+>  $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p]')
+>  $:GPU_DECLARE(create='[x_cb,y_cb,z_cb]', copyin='[x_cc,y_cc,z_cc]', link='[dx,dy,dz,dt,m,n,p]')
+> ```
+
+</details>
+
+<details>
+ <summary><code>GPU_UPDATE</code> <code>(Updates data from CPU to GPU or GPU to CPU)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_UPDATE(...)`
+
+#### Parameters
+
+| name           | data type | Default Value | description                                                  |
+|----------------|-----------|---------------|--------------------------------------------------------------|
+| `host`       | string list   | None      | Updates data from GPU to CPU                         |
+| `device`       | string list   | None      | Updates data from CPU to GPU     |
+| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Additional information
 
 #### Example
 
 > ```python
->  $:GPU_CACHE(cache='[pixels_arr]')
+>  $:GPU_UPDATE(host='[arr1, arr2]')
+>  $:GPU_UPDATE(host='[updated_gpu_val]', device='[updated_cpu_val]')
+> ```
+
+</details>
+
+<details>
+ <summary><code>GPU_HOST_DATA</code> <code>(Make GPU memory address available on CPU)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP call directive using `#:call`
+> ```C
+>
+> #:call GPU_HOST_DATA(...)
+>    {code}
+> #:endcall GPU_HOST_DATA 
+>```
+> 
+
+#### Parameters
+
+| name           | data type   | Default Value | description                                                      |
+|----------------|-------------|---------------|------------------------------------------------------------------|
+| `code`         | code        | Required      | Region of code where GPU memory addresses is accessible          |
+| `use_device`   | string list | None          | Use GPU memory address of variable instead of CPU memory address |
+| `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive     |
+
+#### Parameter Restrictions
+
+| name | Restricted range                                 |
+|------|--------------------------------------------------|
+| `code` | Do not assign it manually with key-value pairing |
+
+#### Additional information
+
+#### Example
+
+> ```C
+>  #:call GPU_HOST_DATA(use_device='[addr1, addr2]')
+>       {code}
+>       ...
+>  #:endcall GPU_HOST_DATA
+>  #:call GPU_HOST_DATA(use_device='[display_arr]')
+>       {code}
+>       ...
+>   #:endcall
+> ```
+
+</details>
+
+------------------------------------------------------------------------------------------
+
+### Synchronization Macros
+
+<details>
+ <summary><code>GPU_WAIT</code> <code>(Makes CPU wait for async GPU activities)</code></summary>
+
+#### Macro Invocation
+
+Uses FYPP eval directive using `$:`
+> `$:GPU_WAIT(...)`
+
+#### Parameters
+
+| name           | data type | Default Value | description                                                  |
+|----------------|-----------|---------------|--------------------------------------------------------------|
+| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
+
+#### Additional information
+
+#### Example
+
+> ```python
+>  $:GPU_WAIT()
 > ```
 
 </details>
@@ -416,7 +408,7 @@ Uses FYPP eval directive using `$:`
 
 | name            | Restricted range                                  |
 |-----------------|---------------------------------------------------|
-| atomic        | 'read', 'write', 'update', or 'capture'                            |
+| `atomic`        | 'read', 'write', 'update', or 'capture'                            |
 
 #### Additional information
 
@@ -445,54 +437,72 @@ Uses FYPP eval directive using `$:`
 
 </details>
 
+------------------------------------------------------------------------------------------
+
+### Miscellaneous Macros
+
 <details>
- <summary><code>GPU_UPDATE</code> <code>(Updates data from CPU to GPU or GPU to CPU)</code></summary>
+ <summary><code>GPU_ROUTINE</code> <code>(Compile a procedure for the GPU)</code></summary>
 
 #### Macro Invocation
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_UPDATE(...)`
+> `$:GPU_ROUTINE(...)`
 
 #### Parameters
 
-| name           | data type | Default Value | description                                                  |
-|----------------|-----------|---------------|--------------------------------------------------------------|
-| `host`       | string list   | None      | Updates data from GPU to CPU                         |
-| `device`       | string list   | None      | Updates data from CPU to GPU     |
-| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
+| name            | data type   | Default Value | description                                                  |
+|-----------------|-------------|---------------|--------------------------------------------------------------|
+| `function_name` | string      | None          | Name of subroutine/function                                  |
+| `parallelism`   | string list | None          | Parallelism granularity to use for this routine              |
+| `nohost`        | boolean     | False         | Do not compile procedure code for CPU                        |
+| `cray_inline`   | boolean     | False         | Inline procedure on cray compiler                            |
+| `extraAccArgs`  | string      | None          | String of any extra arguments added to the OpenACC directive |
+
+#### Parameter Restrictions
+
+| name        | Restricted range                                  |
+|-------------|---------------------------------------------------|
+| `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
 
 #### Additional information
 
+- Function name only needs to be given when cray_inline is True
+- Future capability is to parse function header for function name
+- Routine parallelism is most commonly `'[seq]'`
+
 #### Example
 
 > ```python
->  $:GPU_UPDATE(host='[arr1, arr2]')
->  $:GPU_UPDATE(host='[updated_gpu_val]', device='[updated_cpu_val]')
+>  $:GPU_ROUTINE(parallelism='[seq]')
+>  $:GPU_ROUTINE(function_name='s_matmult', parallelism='[seq]', cray_inline=True)
 > ```
 
 </details>
 
 <details>
- <summary><code>GPU_WAIT</code> <code>(Makes CPU wait for async GPU activities)</code></summary>
+ <summary><code>GPU_CACHE</code> <code>(Data to be cache in software-managed cache)</code></summary>
 
 #### Macro Invocation
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_WAIT(...)`
+> `$:GPU_CACHE(...)`
 
 #### Parameters
 
-| name           | data type | Default Value | description                                                  |
-|----------------|-----------|---------------|--------------------------------------------------------------|
-| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
+| name             | data type   | Default Value | description                                                  |
+|------------------|-------------|---------------|--------------------------------------------------------------|
+| `cache`         | string list | Required          | Data that should to stored in cache           |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Additional information
 
 #### Example
 
 > ```python
->  $:GPU_WAIT()
+>  $:GPU_CACHE(cache='[pixels_arr]')
 > ```
 
 </details>
+
 ------------------------------------------------------------------------------------------

From 7a8cdc2475d539c44cab8ea2021b011f129ca26f Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tprathi3@gatech.edu>
Date: Tue, 24 Jun 2025 13:37:16 -0400
Subject: [PATCH 71/75] Cleaned up docs

---
 docs/documentation/gpuParallelization.md | 234 +++++++++++------------
 1 file changed, 110 insertions(+), 124 deletions(-)

diff --git a/docs/documentation/gpuParallelization.md b/docs/documentation/gpuParallelization.md
index 75f657c855..9c4a783a72 100644
--- a/docs/documentation/gpuParallelization.md
+++ b/docs/documentation/gpuParallelization.md
@@ -69,11 +69,11 @@ Uses FYPP eval directive using `$:`
 
 #### Parameter Restrictions
 
-| name        | Restricted range                                  |
-|-------------|---------------------------------------------------|
-| `collapse`    | Must be greater than 1                            |
-| `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
-| `default`     | 'present' or 'none'                               |
+> | name          | Restricted range                                  |
+> |---------------|---------------------------------------------------|
+> | `collapse`    | Must be greater than 1                            |
+> | `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+> | `default`     | 'present' or 'none'                               |
 
 #### Additional information
 
@@ -102,23 +102,23 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name              | data type           | Default Value | description                                                                                      |
-|-------------------|---------------------|---------------|--------------------------------------------------------------------------------------------------|
-| `collapse`        | integer             | None          | Number of loops to combine into 1 loop                                                           |
-| `parallelism`     | string list         | None          | Parallelism granularity to use for this loop                                                     |
-| `data_dependency` | string              | None          | 'independent'-> assert loop iterations are independent, 'auto->let compiler analyze dependencies |
-| `private`         | string list         | None          | Variables that are private to each iteration/thread                                              |
-| `reduction`       | 2-level string list | None          | Variables unique to each iteration and reduced at the end                                        |
-| `reductionOp`     | string list         | None          | Operator that each list of reduction will reduce with                                            |
-| `extraAccArgs`    | string              | None          | String of any extra arguments added to the OpenACC directive                                     |
+> | name              | data type           | Default Value | description                                                                                      |
+> |-------------------|---------------------|---------------|--------------------------------------------------------------------------------------------------|
+> | `collapse`        | integer             | None          | Number of loops to combine into 1 loop                                                           |
+> | `parallelism`     | string list         | None          | Parallelism granularity to use for this loop                                                     |
+> | `data_dependency` | string              | None          | 'independent'-> assert loop iterations are independent, 'auto->let compiler analyze dependencies |
+> | `private`         | string list         | None          | Variables that are private to each iteration/thread                                              |
+> | `reduction`       | 2-level string list | None          | Variables unique to each iteration and reduced at the end                                        |
+> | `reductionOp`     | string list         | None          | Operator that each list of reduction will reduce with                                            |
+> | `extraAccArgs`    | string              | None          | String of any extra arguments added to the OpenACC directive                                     |
 
 #### Parameter Restrictions
 
-| name            | Restricted range                                  |
-|-----------------|---------------------------------------------------|
-| `collapse`        | Must be greater than 1                            |
-| `parallelism`     | Valid elements: 'gang', 'worker', 'vector', 'seq' |
-| `data_dependency` | 'auto' or 'independent'                           |
+> | name              | Restricted range                                  |
+> |-------------------|---------------------------------------------------|
+> | `collapse`        | Must be greater than 1                            |
+> | `parallelism`     | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+> | `data_dependency` | 'auto' or 'independent'                           |
 
 #### Additional information
 
@@ -145,38 +145,35 @@ Uses FYPP eval directive using `$:`
 #### Macro Invocation
 
 Uses FYPP call directive using `#:call`
+
 > ```C
->
 > #:call GPU_DATA(...)
 >    {code}
 > #:endcall GPU_DATA 
 >```
-> 
 
 #### Parameters
 
-| name             | data type   | Default Value | description                                                                                  |
-|------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
-| `code`           | code        | Required      | Region of code where defined data is accessible                                              |
-| `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
-| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit              |
-| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
-| `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit       |
-| `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                         |
-| `no_create`      | string list | None          | Use data in CPU memory unless data is already in GPU memory                                  |
-| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
-| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
-| `attach`         | string list | None          | Attaches device pointer to device targets on entrance, then detach on exit                   |
-| `default`        | string      | None          | Implicit assumptions compiler should make                                                    |
-| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
+> | name             | data type   | Default Value | description                                                                                  |
+> |------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
+> | `code`           | code        | Required      | Region of code where defined data is accessible                                              |
+> | `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
+> | `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit                    |
+> | `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
+> | `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit             |
+> | `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                               |
+> | `no_create`      | string list | None          | Use data in CPU memory unless data is already in GPU memory                                  |
+> | `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
+> | `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
+> | `attach`         | string list | None          | Attaches device pointer to device targets on entrance, then detach on exit                   |
+> | `default`        | string      | None          | Implicit assumptions compiler should make                                                    |
+> | `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
 
 #### Parameter Restrictions
 
-| name            | Restricted range                                  |
-|-----------------|---------------------------------------------------|
-| `code`        | Do not assign it manually with key-value pairing                            |
-
-#### Additional information
+> | name   | Restricted range                                 |
+> |--------|--------------------------------------------------|
+> | `code` | Do not assign it manually with key-value pairing |
 
 #### Example
 
@@ -188,7 +185,7 @@ Uses FYPP call directive using `#:call`
 >  #:call GPU_DATA(create='[pixel_arr]', copyin='[initial_index]')
 >       {code}
 >       ...
->   #:endcall
+>  #:endcall
 > ```
 
 </details>
@@ -203,15 +200,13 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name             | data type   | Default Value | description                                                  |
-|------------------|-------------|---------------|--------------------------------------------------------------|
-| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance           |
-| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU on entrance  |
-| `create`         | string list | None          | Allocates data on GPU on entrance                      |
-| `attach`         | string list | None          | Attaches device pointer to device targets on entrance        |
-| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
-
-#### Additional information
+> | name             | data type   | Default Value | description                                                  |
+> |------------------|-------------|---------------|--------------------------------------------------------------|
+> | `copyin`         | string list | None          | Allocates and copies data to GPU on entrance                 |
+> | `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU on entrance  |
+> | `create`         | string list | None          | Allocates data on GPU on entrance                            |
+> | `attach`         | string list | None          | Attaches device pointer to device targets on entrance        |
+> | `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Example
 
@@ -232,14 +227,12 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name             | data type   | Default Value | description                                                  |
-|------------------|-------------|---------------|--------------------------------------------------------------|
-| `copyout`         | string list | None          | Deallocates and copies data from GPU to CPU on exit           |
-| `delete`         | string list | None          | Deallocates data on GPU on exit                      |
-| `detach`         | string list | None          | Detach device pointer from device targets on exit        |
-| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
-
-#### Additional information
+> | name           | data type   | Default Value | description                                                  |
+> |----------------|-------------|---------------|--------------------------------------------------------------|
+> | `copyout`      | string list | None          | Deallocates and copies data from GPU to CPU on exit          |
+> | `delete`       | string list | None          | Deallocates data on GPU on exit                              |
+> | `detach`       | string list | None          | Detach device pointer from device targets on exit            |
+> | `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Example
 
@@ -260,22 +253,21 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name             | data type   | Default Value | description                                                                                  |
-|------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
-| `copy`           | string list | None          | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit |
-| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit              |
-| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
-| `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit       |
-| `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                         |
-| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
-| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
-| `link`           | string list | None          | Declare global link, and only allocate when variable used in data clause.                    |
-| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
+> | name             | data type   | Default Value | description                                                                               |
+> |------------------|-------------|---------------|-------------------------------------------------------------------------------------------|
+> | `copy`           | string list | None          | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit  |
+> | `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit                 |
+> | `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit              |
+> | `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit          |
+> | `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                            |
+> | `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit |
+> | `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                |
+> | `link`           | string list | None          | Declare global link, and only allocate when variable used in data clause.                 |
+> | `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                              |
 
 #### Additional information
 
-- An implicit data region is created at the start of each procedure
-and ends after the last executable statement in that procedure.
+- An implicit data region is created at the start of each procedure and ends after the last executable statement in that procedure.
 - Use only create, copyin, device_resident or link clauses for module variables
 - GPU_DECLARE exit is the end of the implicit data region
 - Link is useful for large global static data objects
@@ -299,13 +291,11 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name           | data type | Default Value | description                                                  |
-|----------------|-----------|---------------|--------------------------------------------------------------|
-| `host`       | string list   | None      | Updates data from GPU to CPU                         |
-| `device`       | string list   | None      | Updates data from CPU to GPU     |
-| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
-
-#### Additional information
+> | name           | data type   | Default Value | description                                                  |
+> |----------------|-------------|---------------|--------------------------------------------------------------|
+> | `host`         | string list | None          | Updates data from GPU to CPU                                 |
+> | `device`       | string list | None          | Updates data from CPU to GPU                                 |
+> | `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Example
 
@@ -322,27 +312,26 @@ Uses FYPP eval directive using `$:`
 #### Macro Invocation
 
 Uses FYPP call directive using `#:call`
+
 > ```C
->
 > #:call GPU_HOST_DATA(...)
 >    {code}
 > #:endcall GPU_HOST_DATA 
 >```
-> 
 
 #### Parameters
 
-| name           | data type   | Default Value | description                                                      |
-|----------------|-------------|---------------|------------------------------------------------------------------|
-| `code`         | code        | Required      | Region of code where GPU memory addresses is accessible          |
-| `use_device`   | string list | None          | Use GPU memory address of variable instead of CPU memory address |
-| `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive     |
+> | name           | data type   | Default Value | description                                                      |
+> |----------------|-------------|---------------|------------------------------------------------------------------|
+> | `code`         | code        | Required      | Region of code where GPU memory addresses is accessible          |
+> | `use_device`   | string list | None          | Use GPU memory address of variable instead of CPU memory address |
+> | `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive     |
 
 #### Parameter Restrictions
 
-| name | Restricted range                                 |
-|------|--------------------------------------------------|
-| `code` | Do not assign it manually with key-value pairing |
+> | name   | Restricted range                                 |
+> |--------|--------------------------------------------------|
+> | `code` | Do not assign it manually with key-value pairing |
 
 #### Additional information
 
@@ -375,11 +364,9 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name           | data type | Default Value | description                                                  |
-|----------------|-----------|---------------|--------------------------------------------------------------|
-| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
-
-#### Additional information
+> | name           | data type | Default Value | description                                                  |
+> |----------------|-----------|---------------|--------------------------------------------------------------|
+> | `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Example
 
@@ -399,31 +386,32 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name           | data type | Default Value | description                                                  |
-|----------------|-----------|---------------|--------------------------------------------------------------|
-| `atomic`       | string    | Required      | Which atomic operation is performed                          |
-| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
+> | name           | data type | Default Value | description                                                  |
+> |----------------|-----------|---------------|--------------------------------------------------------------|
+> | `atomic`       | string    | Required      | Which atomic operation is performed                          |
+> | `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Parameter Restrictions
 
-| name            | Restricted range                                  |
-|-----------------|---------------------------------------------------|
-| `atomic`        | 'read', 'write', 'update', or 'capture'                            |
+> | name     | Restricted range                        |
+> |----------|-----------------------------------------|
+> | `atomic` | 'read', 'write', 'update', or 'capture' |
 
 #### Additional information
 
 - read atomic is reading in a value
-    - Ex: `v=x`
+  - Ex: `v=x`
 - write atomic is writing a value to a variable
-    - Ex:`x=square(tmp)`
+  - Ex:`x=square(tmp)`
 - update atomic is updating a variable in-place
-    - Ex:`x= x .and. 1`
+  - Ex:`x= x .and. 1`
 - Capture is a pair of read/write/update operations with one dependent on the other
-    - Ex: ```
-        `x=x .and. 1`
-        `v=x`
-    ```
-    
+  - Ex:
+
+  ```Fortran
+      x=x .and. 1
+      v=x
+  ```
 
 #### Example
 
@@ -451,19 +439,19 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name            | data type   | Default Value | description                                                  |
-|-----------------|-------------|---------------|--------------------------------------------------------------|
-| `function_name` | string      | None          | Name of subroutine/function                                  |
-| `parallelism`   | string list | None          | Parallelism granularity to use for this routine              |
-| `nohost`        | boolean     | False         | Do not compile procedure code for CPU                        |
-| `cray_inline`   | boolean     | False         | Inline procedure on cray compiler                            |
-| `extraAccArgs`  | string      | None          | String of any extra arguments added to the OpenACC directive |
+> | name            | data type   | Default Value | description                                                  |
+> |-----------------|-------------|---------------|--------------------------------------------------------------|
+> | `function_name` | string      | None          | Name of subroutine/function                                  |
+> | `parallelism`   | string list | None          | Parallelism granularity to use for this routine              |
+> | `nohost`        | boolean     | False         | Do not compile procedure code for CPU                        |
+> | `cray_inline`   | boolean     | False         | Inline procedure on cray compiler                            |
+> | `extraAccArgs`  | string      | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Parameter Restrictions
 
-| name        | Restricted range                                  |
-|-------------|---------------------------------------------------|
-| `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+> | name          | Restricted range                                  |
+> |---------------|---------------------------------------------------|
+> | `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
 
 #### Additional information
 
@@ -490,12 +478,10 @@ Uses FYPP eval directive using `$:`
 
 #### Parameters
 
-| name             | data type   | Default Value | description                                                  |
-|------------------|-------------|---------------|--------------------------------------------------------------|
-| `cache`         | string list | Required          | Data that should to stored in cache           |
-| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
-
-#### Additional information
+> | name             | data type   | Default Value | description                                                  |
+> |------------------|-------------|---------------|--------------------------------------------------------------|
+> | `cache`          | string list | Required      | Data that should to stored in cache                          |
+> | `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
 
 #### Example
 

From 6bc96a8a6395845de49d80d746d807501152a3a4 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tprathi3@gatech.edu>
Date: Tue, 24 Jun 2025 15:45:08 -0400
Subject: [PATCH 72/75] Fixes render issues with doxygen

---
 docs/documentation/gpuParallelization.md | 566 ++++++++++++-----------
 1 file changed, 288 insertions(+), 278 deletions(-)

diff --git a/docs/documentation/gpuParallelization.md b/docs/documentation/gpuParallelization.md
index 9c4a783a72..bb0fbc667d 100644
--- a/docs/documentation/gpuParallelization.md
+++ b/docs/documentation/gpuParallelization.md
@@ -1,17 +1,19 @@
-# Parallelization via GPUs
+# GPU Parallelization
 
 MFC compiles GPU code via OpenACC and in the future OpenMP as well.
 
-In order to swap between OpenACC and OpenMP, ACC and MP directives are implemented via FYPP macros.
+In order to swap between OpenACC and OpenMP, custom GPU macros are used that translate to equivalent OpenACC and OpenMP directives.
+FYPP is used to process the GPU macros.
 
 [OpenACC Quick start Guide](https://openacc-best-practices-guide.readthedocs.io/en/latest/01-Introduction.html)
+
 [OpenACC API Documentation](https://www.openacc.org/sites/default/files/inline-files/API%20Guide%202.7.pdf)
 
 ------------------------------------------------------------------------------------------
 
 ## Macro API Documentation
 
-Note: Ordering is not guarennteed or stable, so use key-value pairing when using macros
+Note: Ordering is not guaranteed or stable, so use key-value pairing when using macros
 
 ### Data Type Meanings
 
@@ -20,10 +22,10 @@ Note: Ordering is not guarennteed or stable, so use key-value pairing when using
 - Boolean is a pythonic boolean - Valid options: `True` or `False`
 
 - String List is given as a comma separated list surrounding by brackets and inside quotations
-  - Ex: `'[hello, world, Fortran]'`
+  - Ex: ``'[hello, world, Fortran]'``
 
 - 2-level string list is given as a comma separated list of string lists surrounding by brackets and inside quotations
-  - Ex: `'[[hello, world], [Fortran, MFC]]'` or `[[hello]]`
+  - Ex: ``'[[hello, world], [Fortran, MFC]]'`` or ``'[[hello]]'``
 
 ### Data Flow
 
@@ -38,100 +40,102 @@ Note: Ordering is not guarennteed or stable, so use key-value pairing when using
 ### Computation Macros
 
 <details>
- <summary><code>GPU_PARALLEL_LOOP</code> <code>(Execute the following loop on the GPU in parallel)</code></summary>
+  <summary><code>GPU_PARALLEL_LOOP</code> -- <code>(Execute the following loop on the GPU in parallel)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_PARALLEL_LOOP(...)`
-
-#### Parameters
-
-> | name             | data type           | Default Value     | description                                                                               |
-> |------------------|---------------------|-------------------|-------------------------------------------------------------------------------------------|
-> | `collapse`       | integer             | None              | Number of loops to combine into 1 loop                                                    |
-> | `parallelism`    | string list         | '\[gang,vector\]' | Parallelism granularity to use for this loop                                              |
-> | `default`        | string              | 'present'         | Implicit assumptions compiler should make                                                 |
-> | `private`        | string list         | None              | Variables that are private to each iteration/thread                                       |
-> | `firstprivate`   | string list         | None              | Initialized variables that are private to each iteration/thread                           |
-> | `reduction`      | 2-level string list | None              | Variables unique to each iteration and reduced at the end                                 |
-> | `reductionOp`    | string list         | None              | Operator that each list of reduction will reduce with                                     |
-> | `copy`           | string list         | None              | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit  |
-> | `copyin`         | string list         | None              | Allocates and copies data to GPU on entrance and then deallocated on exit                 |
-> | `copyinReadOnly` | string list         | None              | Allocates and copies readonly data to GPU and then deallocated on exit                    |
-> | `copyout`        | string list         | None              | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit          |
-> | `create`         | string list         | None              | Allocates data on GPU on entrance and then deallocates on exit                            |
-> | `no_create`      | string list         | None              | Use data in CPU memory unless data is already in GPU memory                               |
-> | `present`        | string list         | None              | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit |
-> | `deviceptr`      | string list         | None              | Pointer variables that are already allocated on GPU memory                                |
-> | `attach`         | string list         | None              | Attaches device pointer to device targets on entrance, then detach on exit                |
-> | `extraAccArgs`   | string              | None              | String of any extra arguments added to the OpenACC directive                              |
-
-#### Parameter Restrictions
-
-> | name          | Restricted range                                  |
-> |---------------|---------------------------------------------------|
-> | `collapse`    | Must be greater than 1                            |
-> | `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
-> | `default`     | 'present' or 'none'                               |
-
-#### Additional information
+
+`$:GPU_PARALLEL_LOOP(...)`
+
+**Parameters**
+
+| name             | data type           | Default Value     | description                                                                               |
+|------------------|---------------------|-------------------|-------------------------------------------------------------------------------------------|
+| `collapse`       | integer             | None              | Number of loops to combine into 1 loop                                                    |
+| `parallelism`    | string list         | '\[gang,vector\]' | Parallelism granularity to use for this loop                                              |
+| `default`        | string              | 'present'         | Implicit assumptions compiler should make                                                 |
+| `private`        | string list         | None              | Variables that are private to each iteration/thread                                       |
+| `firstprivate`   | string list         | None              | Initialized variables that are private to each iteration/thread                           |
+| `reduction`      | 2-level string list | None              | Variables unique to each iteration and reduced at the end                                 |
+| `reductionOp`    | string list         | None              | Operator that each list of reduction will reduce with                                     |
+| `copy`           | string list         | None              | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit  |
+| `copyin`         | string list         | None              | Allocates and copies data to GPU on entrance and then deallocated on exit                 |
+| `copyinReadOnly` | string list         | None              | Allocates and copies readonly data to GPU and then deallocated on exit                    |
+| `copyout`        | string list         | None              | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit          |
+| `create`         | string list         | None              | Allocates data on GPU on entrance and then deallocates on exit                            |
+| `no_create`      | string list         | None              | Use data in CPU memory unless data is already in GPU memory                               |
+| `present`        | string list         | None              | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit |
+| `deviceptr`      | string list         | None              | Pointer variables that are already allocated on GPU memory                                |
+| `attach`         | string list         | None              | Attaches device pointer to device targets on entrance, then detach on exit                |
+| `extraAccArgs`   | string              | None              | String of any extra arguments added to the OpenACC directive                              |
+
+**Parameter Restrictions**
+
+| name          | Restricted range                                  |
+|---------------|---------------------------------------------------|
+| `collapse`    | Must be greater than 1                            |
+| `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+| `default`     | 'present' or 'none'                               |
+
+**Additional information**
 
 - default present means that the any non-scalar data in assumed to be present on the GPU
 - default none means that the compiler should not implicitly determine the data attributes for any variable
 - reduction and reductionOp must match in length
-- With `reduction='[[sum1, sum2], [largest]]'` and `reductionOp='[+, max]'`, `sum1` and `sum2` will be the sum of sum1/sum2 in each loop iteration, and `largest` will the maximum value of `largest` all the loop iterations
+- With ``reduction='[[sum1, sum2], [largest]]'`` and ``reductionOp='[+, max]'``, `sum1` and `sum2` will be the sum of sum1/sum2 in each loop iteration, and `largest` will the maximum value of `largest` all the loop iterations
 - A reduction implies a copy, so it does not need to be added for both
 
-#### Example
+**Example**
 
-> ```python
->  $:GPU_PARALLEL_LOOP(collapse=3, private='[tmp, r]', reduction='[[vol, avg], [max_val]]', reductionOp='[+, MAX]')
->  $:GPU_PARALLEL_LOOP(collapse=2, private='[sum_holder]', copyin='[starting_sum]', copyout='[eigenval,C]')
-> ```
+```python
+ $:GPU_PARALLEL_LOOP(collapse=3, private='[tmp, r]', reduction='[[vol, avg], [max_val]]', reductionOp='[+, MAX]')
+ $:GPU_PARALLEL_LOOP(collapse=2, private='[sum_holder]', copyin='[starting_sum]', copyout='[eigenval,C]')
+```
 
 </details>
 
 <details>
-  <summary><code>GPU_LOOP</code> <code>(Execute loop on GPU)</code></summary>
+  <summary><code>GPU_LOOP</code> -- <code>(Execute loop on GPU)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_LOOP(...)`
 
-#### Parameters
+`$:GPU_LOOP(...)`
+
+**Parameters**
 
-> | name              | data type           | Default Value | description                                                                                      |
-> |-------------------|---------------------|---------------|--------------------------------------------------------------------------------------------------|
-> | `collapse`        | integer             | None          | Number of loops to combine into 1 loop                                                           |
-> | `parallelism`     | string list         | None          | Parallelism granularity to use for this loop                                                     |
-> | `data_dependency` | string              | None          | 'independent'-> assert loop iterations are independent, 'auto->let compiler analyze dependencies |
-> | `private`         | string list         | None          | Variables that are private to each iteration/thread                                              |
-> | `reduction`       | 2-level string list | None          | Variables unique to each iteration and reduced at the end                                        |
-> | `reductionOp`     | string list         | None          | Operator that each list of reduction will reduce with                                            |
-> | `extraAccArgs`    | string              | None          | String of any extra arguments added to the OpenACC directive                                     |
+| name              | data type           | Default Value | description                                                                                      |
+|-------------------|---------------------|---------------|--------------------------------------------------------------------------------------------------|
+| `collapse`        | integer             | None          | Number of loops to combine into 1 loop                                                           |
+| `parallelism`     | string list         | None          | Parallelism granularity to use for this loop                                                     |
+| `data_dependency` | string              | None          | 'independent'-> assert loop iterations are independent, 'auto->let compiler analyze dependencies |
+| `private`         | string list         | None          | Variables that are private to each iteration/thread                                              |
+| `reduction`       | 2-level string list | None          | Variables unique to each iteration and reduced at the end                                        |
+| `reductionOp`     | string list         | None          | Operator that each list of reduction will reduce with                                            |
+| `extraAccArgs`    | string              | None          | String of any extra arguments added to the OpenACC directive                                     |
 
-#### Parameter Restrictions
+**Parameter Restrictions**
 
-> | name              | Restricted range                                  |
-> |-------------------|---------------------------------------------------|
-> | `collapse`        | Must be greater than 1                            |
-> | `parallelism`     | Valid elements: 'gang', 'worker', 'vector', 'seq' |
-> | `data_dependency` | 'auto' or 'independent'                           |
+| name              | Restricted range                                  |
+|-------------------|---------------------------------------------------|
+| `collapse`        | Must be greater than 1                            |
+| `parallelism`     | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+| `data_dependency` | 'auto' or 'independent'                           |
 
-#### Additional information
+**Additional information**
 
-- Loop parallelism is most commonly `'[seq]'`
+- Loop parallelism is most commonly ``'[seq]'``
 - reduction and reductionOp must match in length
-- With `reduction='[[sum1, sum2], [largest]]'` and `reductionOp='[+, max]'`, `sum1` and `sum2` will be the sum of sum1/sum2 in each loop iteration, and `largest` will the maximum value of `largest` all the loop iterations
+- With ``reduction='[[sum1, sum2], [largest]]'`` and ``reductionOp='[+, max]'``, `sum1` and `sum2` will be the sum of sum1/sum2 in each loop iteration, and `largest` will the maximum value of `largest` all the loop iterations
 
-#### Example
+**Example**
 
-> ```python
->  $:GPU_PARALLEL_LOOP(parallelism='[seq]')
->  $:GPU_PARALLEL_LOOP(collapse=3, parallelism='[seq]',private='[tmp, r]')
-> ```
+```python
+ $:GPU_LOOP(parallelism='[seq]')
+ $:GPU_LOOP(collapse=3, parallelism='[seq]',private='[tmp, r]')
+```
 
 </details>
 
@@ -140,213 +144,215 @@ Uses FYPP eval directive using `$:`
 ### Data Control Macros
 
 <details>
- <summary><code>GPU_DATA</code> <code>(Make data accessible on GPU in specified region)</code></summary>
+ <summary><code>GPU_DATA</code> -- <code>(Make data accessible on GPU in specified region)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP call directive using `#:call`
 
-> ```C
-> #:call GPU_DATA(...)
->    {code}
-> #:endcall GPU_DATA 
->```
-
-#### Parameters
-
-> | name             | data type   | Default Value | description                                                                                  |
-> |------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
-> | `code`           | code        | Required      | Region of code where defined data is accessible                                              |
-> | `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
-> | `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit                    |
-> | `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
-> | `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit             |
-> | `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                               |
-> | `no_create`      | string list | None          | Use data in CPU memory unless data is already in GPU memory                                  |
-> | `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
-> | `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
-> | `attach`         | string list | None          | Attaches device pointer to device targets on entrance, then detach on exit                   |
-> | `default`        | string      | None          | Implicit assumptions compiler should make                                                    |
-> | `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
-
-#### Parameter Restrictions
-
-> | name   | Restricted range                                 |
-> |--------|--------------------------------------------------|
-> | `code` | Do not assign it manually with key-value pairing |
-
-#### Example
-
-> ```C
->  #:call GPU_DATA(copy='[pixel_arr]', copyin='[starting_pixels, initial_index]',attach='[p_real, p_cmplx, p_fltr_cmplx]')
->       {code}
->       ...
->  #:endcall GPU_DATA
->  #:call GPU_DATA(create='[pixel_arr]', copyin='[initial_index]')
->       {code}
->       ...
->  #:endcall
-> ```
+```C
+#:call GPU_DATA(...)
+   {code}
+#:endcall GPU_DATA 
+```
+
+**Parameters**
+
+| name             | data type   | Default Value | description                                                                                  |
+|------------------|-------------|---------------|----------------------------------------------------------------------------------------------|
+| `code`           | code        | Required      | Region of code where defined data is accessible                                              |
+| `copy`           | string list | None          | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit |
+| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit                    |
+| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit                 |
+| `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit             |
+| `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                               |
+| `no_create`      | string list | None          | Use data in CPU memory unless data is already in GPU memory                                  |
+| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit    |
+| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                   |
+| `attach`         | string list | None          | Attaches device pointer to device targets on entrance, then detach on exit                   |
+| `default`        | string      | None          | Implicit assumptions compiler should make                                                    |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                                 |
+
+**Parameter Restrictions**
+
+| name   | Restricted range                                 |
+|--------|--------------------------------------------------|
+| `code` | Do not assign it manually with key-value pairing |
+
+**Example**
+
+```C
+ #:call GPU_DATA(copy='[pixel_arr]', copyin='[starting_pixels, initial_index]',attach='[p_real, p_cmplx, p_fltr_cmplx]')
+      {code}
+      ...
+ #:endcall GPU_DATA
+ #:call GPU_DATA(create='[pixel_arr]', copyin='[initial_index]')
+      {code}
+      ...
+ #:endcall
+```
 
 </details>
 
 <details>
- <summary><code>GPU_ENTER_DATA</code> <code>(Allocate/move data to GPU until matching GPU_EXIT_DATA or program termination)</code></summary>
+ <summary><code>GPU_ENTER_DATA</code> -- <code>(Allocate/move data to GPU until matching GPU_EXIT_DATA or program termination)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_ENTER_DATA(...)`
 
-#### Parameters
+`$:GPU_ENTER_DATA(...)`
 
-> | name             | data type   | Default Value | description                                                  |
-> |------------------|-------------|---------------|--------------------------------------------------------------|
-> | `copyin`         | string list | None          | Allocates and copies data to GPU on entrance                 |
-> | `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU on entrance  |
-> | `create`         | string list | None          | Allocates data on GPU on entrance                            |
-> | `attach`         | string list | None          | Attaches device pointer to device targets on entrance        |
-> | `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
+**Parameter**
 
-#### Example
+| name             | data type   | Default Value | description                                                  |
+|------------------|-------------|---------------|--------------------------------------------------------------|
+| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance                 |
+| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU on entrance  |
+| `create`         | string list | None          | Allocates data on GPU on entrance                            |
+| `attach`         | string list | None          | Attaches device pointer to device targets on entrance        |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
 
-> ```python
->  $:GPU_ENTER_DATA(copyin='[pixels_arr]', copyinReadOnly='[starting_pixels, initial_index]')
->  $:GPU_ENTER_DATA(create='[bc_buffers(1:num_dims, -1:1)]', copyin='[initial_index]')
-> ```
+**Example**
+
+```python
+ $:GPU_ENTER_DATA(copyin='[pixels_arr]', copyinReadOnly='[starting_pixels, initial_index]')
+ $:GPU_ENTER_DATA(create='[bc_buffers(1:num_dims, -1:1)]', copyin='[initial_index]')
+```
 
 </details>
 
 <details>
- <summary><code>GPU_EXIT_DATA</code> <code>(Deallocate/move data from GPU created by GPU_ENTER_DATA)</code></summary>
+ <summary><code>GPU_EXIT_DATA</code> -- <code>(Deallocate/move data from GPU created by GPU_ENTER_DATA)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_EXIT_DATA(...)`
 
-#### Parameters
+`$:GPU_EXIT_DATA(...)`
+
+**Parameters**
 
-> | name           | data type   | Default Value | description                                                  |
-> |----------------|-------------|---------------|--------------------------------------------------------------|
-> | `copyout`      | string list | None          | Deallocates and copies data from GPU to CPU on exit          |
-> | `delete`       | string list | None          | Deallocates data on GPU on exit                              |
-> | `detach`       | string list | None          | Detach device pointer from device targets on exit            |
-> | `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive |
+| name           | data type   | Default Value | description                                                  |
+|----------------|-------------|---------------|--------------------------------------------------------------|
+| `copyout`      | string list | None          | Deallocates and copies data from GPU to CPU on exit          |
+| `delete`       | string list | None          | Deallocates data on GPU on exit                              |
+| `detach`       | string list | None          | Detach device pointer from device targets on exit            |
+| `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive |
 
-#### Example
+**Example**
 
-> ```python
->  $:GPU_EXIT_DATA(copyout='[pixels_arr]', delete='[starting_pixels, initial_index]')
->  $:GPU_EXIT_DATA(delete='[bc_buffers(1:num_dims, -1:1)]', copyout='[initial_index]')
-> ```
+```python
+ $:GPU_EXIT_DATA(copyout='[pixels_arr]', delete='[starting_pixels, initial_index]')
+ $:GPU_EXIT_DATA(delete='[bc_buffers(1:num_dims, -1:1)]', copyout='[initial_index]')
+```
 
 </details>
 
 <details>
- <summary><code>GPU_DECLARE</code> <code>(Allocate module variables on GPU or for implicit data region )</code></summary>
+ <summary><code>GPU_DECLARE</code> -- <code>(Allocate module variables on GPU or for implicit data region )</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_DECLARE(...)`
 
-#### Parameters
+`$:GPU_DECLARE(...)`
+
+**Parameters**
 
-> | name             | data type   | Default Value | description                                                                               |
-> |------------------|-------------|---------------|-------------------------------------------------------------------------------------------|
-> | `copy`           | string list | None          | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit  |
-> | `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit                 |
-> | `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit              |
-> | `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit          |
-> | `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                            |
-> | `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit |
-> | `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                |
-> | `link`           | string list | None          | Declare global link, and only allocate when variable used in data clause.                 |
-> | `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                              |
+| name             | data type   | Default Value | description                                                                               |
+|------------------|-------------|---------------|-------------------------------------------------------------------------------------------|
+| `copy`           | string list | None          | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit  |
+| `copyin`         | string list | None          | Allocates and copies data to GPU on entrance and then deallocated on exit                 |
+| `copyinReadOnly` | string list | None          | Allocates and copies a readonly variable to GPU and then deallocated on exit              |
+| `copyout`        | string list | None          | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit          |
+| `create`         | string list | None          | Allocates data on GPU on entrance and then deallocates on exit                            |
+| `present`        | string list | None          | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit |
+| `deviceptr`      | string list | None          | Pointer variables that are already allocated on GPU memory                                |
+| `link`           | string list | None          | Declare global link, and only allocate when variable used in data clause.                 |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive                              |
 
-#### Additional information
+**Additional information**
 
 - An implicit data region is created at the start of each procedure and ends after the last executable statement in that procedure.
 - Use only create, copyin, device_resident or link clauses for module variables
 - GPU_DECLARE exit is the end of the implicit data region
 - Link is useful for large global static data objects
 
-#### Example
+**Example**
 
-> ```python
->  $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p]')
->  $:GPU_DECLARE(create='[x_cb,y_cb,z_cb]', copyin='[x_cc,y_cc,z_cc]', link='[dx,dy,dz,dt,m,n,p]')
-> ```
+```python
+ $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p]')
+ $:GPU_DECLARE(create='[x_cb,y_cb,z_cb]', copyin='[x_cc,y_cc,z_cc]', link='[dx,dy,dz,dt,m,n,p]')
+```
 
 </details>
 
 <details>
- <summary><code>GPU_UPDATE</code> <code>(Updates data from CPU to GPU or GPU to CPU)</code></summary>
+ <summary><code>GPU_UPDATE</code> -- <code>(Updates data from CPU to GPU or GPU to CPU)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_UPDATE(...)`
 
-#### Parameters
+`$:GPU_UPDATE(...)`
 
-> | name           | data type   | Default Value | description                                                  |
-> |----------------|-------------|---------------|--------------------------------------------------------------|
-> | `host`         | string list | None          | Updates data from GPU to CPU                                 |
-> | `device`       | string list | None          | Updates data from CPU to GPU                                 |
-> | `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive |
+**Parameters**
 
-#### Example
+| name           | data type   | Default Value | description                                                  |
+|----------------|-------------|---------------|--------------------------------------------------------------|
+| `host`         | string list | None          | Updates data from GPU to CPU                                 |
+| `device`       | string list | None          | Updates data from CPU to GPU                                 |
+| `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive |
 
-> ```python
->  $:GPU_UPDATE(host='[arr1, arr2]')
->  $:GPU_UPDATE(host='[updated_gpu_val]', device='[updated_cpu_val]')
-> ```
+**Example**
+
+```python
+ $:GPU_UPDATE(host='[arr1, arr2]')
+ $:GPU_UPDATE(host='[updated_gpu_val]', device='[updated_cpu_val]')
+```
 
 </details>
 
 <details>
- <summary><code>GPU_HOST_DATA</code> <code>(Make GPU memory address available on CPU)</code></summary>
+ <summary><code>GPU_HOST_DATA</code> -- <code>(Make GPU memory address available on CPU)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP call directive using `#:call`
 
-> ```C
-> #:call GPU_HOST_DATA(...)
->    {code}
-> #:endcall GPU_HOST_DATA 
->```
-
-#### Parameters
+```C
+ #:call GPU_HOST_DATA(...)
+    {code}
+ #:endcall GPU_HOST_DATA 
+```
 
-> | name           | data type   | Default Value | description                                                      |
-> |----------------|-------------|---------------|------------------------------------------------------------------|
-> | `code`         | code        | Required      | Region of code where GPU memory addresses is accessible          |
-> | `use_device`   | string list | None          | Use GPU memory address of variable instead of CPU memory address |
-> | `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive     |
+**Parameters**
 
-#### Parameter Restrictions
+| name           | data type   | Default Value | description                                                      |
+|----------------|-------------|---------------|------------------------------------------------------------------|
+| `code`         | code        | Required      | Region of code where GPU memory addresses is accessible          |
+| `use_device`   | string list | None          | Use GPU memory address of variable instead of CPU memory address |
+| `extraAccArgs` | string      | None          | String of any extra arguments added to the OpenACC directive     |
 
-> | name   | Restricted range                                 |
-> |--------|--------------------------------------------------|
-> | `code` | Do not assign it manually with key-value pairing |
+**Parameter Restrictions**
 
-#### Additional information
+| name   | Restricted range                                 |
+|--------|--------------------------------------------------|
+| `code` | Do not assign it manually with key-value pairing |
 
-#### Example
+**Example**
 
-> ```C
->  #:call GPU_HOST_DATA(use_device='[addr1, addr2]')
->       {code}
->       ...
->  #:endcall GPU_HOST_DATA
->  #:call GPU_HOST_DATA(use_device='[display_arr]')
->       {code}
->       ...
->   #:endcall
-> ```
+```C
+ #:call GPU_HOST_DATA(use_device='[addr1, addr2]')
+      {code}
+      ...
+ #:endcall GPU_HOST_DATA
+ #:call GPU_HOST_DATA(use_device='[display_arr]')
+      {code}
+      ...
+  #:endcall
+```
 
 </details>
 
@@ -355,49 +361,51 @@ Uses FYPP call directive using `#:call`
 ### Synchronization Macros
 
 <details>
- <summary><code>GPU_WAIT</code> <code>(Makes CPU wait for async GPU activities)</code></summary>
+ <summary><code>GPU_WAIT</code> -- <code>(Makes CPU wait for async GPU activities)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_WAIT(...)`
 
-#### Parameters
+`$:GPU_WAIT(...)`
+
+**Parameters**
 
-> | name           | data type | Default Value | description                                                  |
-> |----------------|-----------|---------------|--------------------------------------------------------------|
-> | `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
+| name           | data type | Default Value | description                                                  |
+|----------------|-----------|---------------|--------------------------------------------------------------|
+| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
 
-#### Example
+**Example**
 
-> ```python
->  $:GPU_WAIT()
-> ```
+```python
+ $:GPU_WAIT()
+```
 
 </details>
 
 <details>
- <summary><code>GPU_ATOMIC</code> <code>(Do an atomic operation on the GPU)</code></summary>
+ <summary><code>GPU_ATOMIC</code> -- <code>(Do an atomic operation on the GPU)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_ATOMIC(...)`
 
-#### Parameters
+`$:GPU_ATOMIC(...)`
 
-> | name           | data type | Default Value | description                                                  |
-> |----------------|-----------|---------------|--------------------------------------------------------------|
-> | `atomic`       | string    | Required      | Which atomic operation is performed                          |
-> | `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
+**Parameters**
 
-#### Parameter Restrictions
+| name           | data type | Default Value | description                                                  |
+|----------------|-----------|---------------|--------------------------------------------------------------|
+| `atomic`       | string    | Required      | Which atomic operation is performed                          |
+| `extraAccArgs` | string    | None          | String of any extra arguments added to the OpenACC directive |
 
-> | name     | Restricted range                        |
-> |----------|-----------------------------------------|
-> | `atomic` | 'read', 'write', 'update', or 'capture' |
+**Parameter Restrictions**
 
-#### Additional information
+| name     | Restricted range                        |
+|----------|-----------------------------------------|
+| `atomic` | 'read', 'write', 'update', or 'capture' |
+
+**Additional information**
 
 - read atomic is reading in a value
   - Ex: `v=x`
@@ -413,15 +421,15 @@ Uses FYPP eval directive using `$:`
       v=x
   ```
 
-#### Example
+**Example**
 
-> ```python
->  $:GPU_ATOMIC(atomic='update')
->  x = square(x)
->  $:GPU_ATOMIC(atomic='capture')
->  x = square(x)
->  v = x
-> ```
+```python
+ $:GPU_ATOMIC(atomic='update')
+ x = square(x)
+ $:GPU_ATOMIC(atomic='capture')
+ x = square(x)
+ v = x
+```
 
 </details>
 
@@ -430,64 +438,66 @@ Uses FYPP eval directive using `$:`
 ### Miscellaneous Macros
 
 <details>
- <summary><code>GPU_ROUTINE</code> <code>(Compile a procedure for the GPU)</code></summary>
+ <summary><code>GPU_ROUTINE</code> -- <code>(Compile a procedure for the GPU)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_ROUTINE(...)`
 
-#### Parameters
+`$:GPU_ROUTINE(...)`
+
+**Parameters**
 
-> | name            | data type   | Default Value | description                                                  |
-> |-----------------|-------------|---------------|--------------------------------------------------------------|
-> | `function_name` | string      | None          | Name of subroutine/function                                  |
-> | `parallelism`   | string list | None          | Parallelism granularity to use for this routine              |
-> | `nohost`        | boolean     | False         | Do not compile procedure code for CPU                        |
-> | `cray_inline`   | boolean     | False         | Inline procedure on cray compiler                            |
-> | `extraAccArgs`  | string      | None          | String of any extra arguments added to the OpenACC directive |
+| name            | data type   | Default Value | description                                                  |
+|-----------------|-------------|---------------|--------------------------------------------------------------|
+| `function_name` | string      | None          | Name of subroutine/function                                  |
+| `parallelism`   | string list | None          | Parallelism granularity to use for this routine              |
+| `nohost`        | boolean     | False         | Do not compile procedure code for CPU                        |
+| `cray_inline`   | boolean     | False         | Inline procedure on cray compiler                            |
+| `extraAccArgs`  | string      | None          | String of any extra arguments added to the OpenACC directive |
 
-#### Parameter Restrictions
+**Parameter Restrictions**
 
-> | name          | Restricted range                                  |
-> |---------------|---------------------------------------------------|
-> | `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+| name          | Restricted range                                  |
+|---------------|---------------------------------------------------|
+| `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
 
-#### Additional information
+**Additional information**
 
 - Function name only needs to be given when cray_inline is True
 - Future capability is to parse function header for function name
-- Routine parallelism is most commonly `'[seq]'`
+- Routine parallelism is most commonly ``'[seq]'``
 
-#### Example
+**Example**
 
-> ```python
->  $:GPU_ROUTINE(parallelism='[seq]')
->  $:GPU_ROUTINE(function_name='s_matmult', parallelism='[seq]', cray_inline=True)
-> ```
+```python
+ $:GPU_ROUTINE(parallelism='[seq]')
+ $:GPU_ROUTINE(function_name='s_matmult', parallelism='[seq]', cray_inline=True)
+```
 
 </details>
 
 <details>
- <summary><code>GPU_CACHE</code> <code>(Data to be cache in software-managed cache)</code></summary>
+ <summary><code>GPU_CACHE</code> -- <code>(Data to be cache in software-managed cache)</code></summary>
 
-#### Macro Invocation
+**Macro Invocation**
 
 Uses FYPP eval directive using `$:`
-> `$:GPU_CACHE(...)`
 
-#### Parameters
+`$:GPU_CACHE(...)`
+
+**Parameters**
 
-> | name             | data type   | Default Value | description                                                  |
-> |------------------|-------------|---------------|--------------------------------------------------------------|
-> | `cache`          | string list | Required      | Data that should to stored in cache                          |
-> | `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
+| name             | data type   | Default Value | description                                                  |
+|------------------|-------------|---------------|--------------------------------------------------------------|
+| `cache`          | string list | Required      | Data that should to stored in cache                          |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive |
 
-#### Example
+**Example**
 
-> ```python
->  $:GPU_CACHE(cache='[pixels_arr]')
-> ```
+```python
+ $:GPU_CACHE(cache='[pixels_arr]')
+```
 
 </details>
 

From f976b66cc1c3cfd2f60cb816e220dee6ddc0f923 Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 2 Jul 2025 14:01:02 -0400
Subject: [PATCH 73/75] Added GPU_PARALLEL and replaced rest of acc directives

---
 docs/documentation/gpuParallelization.md | 62 +++++++++++++++++++
 src/common/include/parallel_macros.fpp   | 28 +++++++++
 src/common/m_mpi_common.fpp              | 79 +++++++++++++-----------
 src/simulation/m_data_output.fpp         |  9 ++-
 src/simulation/m_riemann_solvers.fpp     | 10 +--
 src/simulation/m_sim_helpers.fpp         |  4 +-
 src/simulation/m_time_steppers.fpp       |  4 +-
 7 files changed, 145 insertions(+), 51 deletions(-)

diff --git a/docs/documentation/gpuParallelization.md b/docs/documentation/gpuParallelization.md
index bb0fbc667d..8579914485 100644
--- a/docs/documentation/gpuParallelization.md
+++ b/docs/documentation/gpuParallelization.md
@@ -139,6 +139,68 @@ Uses FYPP eval directive using `$:`
 
 </details>
 
+<details>
+  <summary><code>GPU_PARALLEL</code> -- <code>(Execute the following on the GPU in parallel)</code></summary>
+
+**Macro Invocation**
+
+Uses FYPP call directive using `#:call`
+
+```C
+#:call GPU_PARALLEL(...)
+   {code}
+#:endcall GPU_PARALLEL 
+```
+
+**Parameters**
+
+| name             | data type           | Default Value     | description                                                                               |
+|------------------|---------------------|-------------------|-------------------------------------------------------------------------------------------|
+| `default`        | string              | 'present'         | Implicit assumptions compiler should make                                                 |
+| `private`        | string list         | None              | Variables that are private to each iteration/thread                                       |
+| `firstprivate`   | string list         | None              | Initialized variables that are private to each iteration/thread                           |
+| `reduction`      | 2-level string list | None              | Variables unique to each iteration and reduced at the end                                 |
+| `reductionOp`    | string list         | None              | Operator that each list of reduction will reduce with                                     |
+| `copy`           | string list         | None              | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit  |
+| `copyin`         | string list         | None              | Allocates and copies data to GPU on entrance and then deallocated on exit                 |
+| `copyinReadOnly` | string list         | None              | Allocates and copies readonly data to GPU and then deallocated on exit                    |
+| `copyout`        | string list         | None              | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit          |
+| `create`         | string list         | None              | Allocates data on GPU on entrance and then deallocates on exit                            |
+| `no_create`      | string list         | None              | Use data in CPU memory unless data is already in GPU memory                               |
+| `present`        | string list         | None              | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit |
+| `deviceptr`      | string list         | None              | Pointer variables that are already allocated on GPU memory                                |
+| `attach`         | string list         | None              | Attaches device pointer to device targets on entrance, then detach on exit                |
+| `extraAccArgs`   | string              | None              | String of any extra arguments added to the OpenACC directive                              |
+
+**Parameter Restrictions**
+
+| name          | Restricted range                                  |
+|---------------|---------------------------------------------------|
+| `default`     | 'present' or 'none'                               |
+
+**Additional information**
+
+- default present means that the any non-scalar data in assumed to be present on the GPU
+- default none means that the compiler should not implicitly determine the data attributes for any variable
+- reduction and reductionOp must match in length
+- With ``reduction='[[sum1, sum2], [largest]]'`` and ``reductionOp='[+, max]'``, `sum1` and `sum2` will be the sum of sum1/sum2 in each loop iteration, and `largest` will the maximum value of `largest` all the loop iterations
+- A reduction implies a copy, so it does not need to be added for both
+
+**Example**
+
+```C
+ #:call GPU_PARALLEL()
+      {code}
+      ...
+ #:endcall GPU_PARALLEL
+ #:call GPU_PARALLEL(create='[pixel_arr]', copyin='[initial_index]')
+      {code}
+      ...
+ #:endcall
+```
+
+</details>
+
 ------------------------------------------------------------------------------------------
 
 ### Data Control Macros
diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index 5b0b8bf564..8d0a5a673b 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -199,6 +199,34 @@
     $:use_device_val
 #:enddef
 
+#:def GPU_PARALLEL(code, private=None, default='present', firstprivate=None, reduction=None, reductionOp=None, &
+    & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, &
+    & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None)
+    #:set default_val = GEN_DEFAULT_STR(default)
+    #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n')
+    #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
+    #:set copy_val = GEN_COPY_STR(copy)
+    #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
+    #:set copyout_val = GEN_COPYOUT_STR(copyout)
+    #:set create_val = GEN_CREATE_STR(create)
+    #:set no_create_val = GEN_NOCREATE_STR(no_create)
+    #:set present_val = GEN_PRESENT_STR(present)
+    #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
+    #:set attach_val = GEN_ATTACH_STR(attach)
+    #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
+    #:set clause_val = default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + &
+        & copy_val.strip('\n') + copyin_val.strip('\n') + &
+        & copyout_val.strip('\n') + create_val.strip('\n') + &
+        & no_create_val.strip('\n') + present_val.strip('\n') + &
+        & deviceptr_val.strip('\n') + attach_val.strip('\n')
+    #:set acc_directive = '!$acc parallel ' + &
+        & clause_val + extraAccArgs_val.strip('\n')
+    #:set end_acc_directive = '!$acc end parallel'
+    $:acc_directive
+    $:code
+    $:end_acc_directive
+#:enddef
+
 
 #:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', &
     & default='present', firstprivate=None, reduction=None, reductionOp=None, &
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index b920151488..568dddb299 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -25,7 +25,7 @@ module m_mpi_common
     implicit none
 
     integer, private :: ierr, v_size !<
-    !$acc declare create(v_size)
+    $:GPU_DECLARE(create='[v_size]')
     !! Generic flags used to identify and report MPI errors
 
     real(wp), private, allocatable, dimension(:) :: buff_send !<
@@ -38,10 +38,10 @@ module m_mpi_common
     !! average primitive variables, for a single computational domain boundary
     !! at the time, from the relevant neighboring processor.
 
-    !$acc declare create(buff_send, buff_recv)
+    $:GPU_DECLARE(create='[buff_send, buff_recv]')
 
     integer :: halo_size
-    !$acc declare create(halo_size)
+    $:GPU_DECLARE(create='[halo_size]')
 
 contains
 
@@ -76,7 +76,7 @@ contains
             halo_size = -1 + buff_size*(v_size)
         end if
 
-        !$acc update device(halo_size, v_size)
+        $:GPU_UPDATE(device='[halo_size, v_size]')
 
         @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
 #endif
@@ -631,7 +631,7 @@ contains
                             /)
         end if
 
-        !$acc update device(v_size)
+        $:GPU_UPDATE(device='[v_size]')
 
         buffer_count = buffer_counts(mpi_dir)
         boundary_conditions = (/bc_x, bc_y, bc_z/)
@@ -667,7 +667,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
                     do l = 0, p
                         do k = 0, n
                             do j = 0, buff_size - 1
@@ -680,7 +680,7 @@ contains
                     end do
 
                     if (qbmm_comm) then
-                        !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
                         do l = 0, p
                             do k = 0, n
                                 do j = 0, buff_size - 1
@@ -695,7 +695,7 @@ contains
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do l = 0, p
                             do k = 0, n
                                 do j = 0, buff_size - 1
@@ -711,7 +711,7 @@ contains
                         end do
                     end if
                 #:elif mpi_dir == 2
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
                     do i = 1, nVar
                         do l = 0, p
                             do k = 0, buff_size - 1
@@ -726,7 +726,7 @@ contains
                     end do
 
                     if (qbmm_comm) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do i = nVar + 1, nVar + 4
                             do l = 0, p
                                 do k = 0, buff_size - 1
@@ -742,7 +742,7 @@ contains
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do i = nVar + 1, nVar + 4
                             do l = 0, p
                                 do k = 0, buff_size - 1
@@ -759,7 +759,7 @@ contains
                         end do
                     end if
                 #:else
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
                     do i = 1, nVar
                         do l = 0, buff_size - 1
                             do k = -buff_size, n + buff_size
@@ -774,7 +774,7 @@ contains
                     end do
 
                     if (qbmm_comm) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do i = nVar + 1, nVar + 4
                             do l = 0, buff_size - 1
                                 do k = -buff_size, n + buff_size
@@ -790,7 +790,7 @@ contains
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do i = nVar + 1, nVar + 4
                             do l = 0, buff_size - 1
                                 do k = -buff_size, n + buff_size
@@ -816,28 +816,33 @@ contains
         #:for rdma_mpi in [False, True]
             if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
                 #:if rdma_mpi
-                    !$acc host_data use_device(buff_send, buff_recv)
-                    call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
+                    #:call GPU_HOST_DATA(use_device='[buff_send, buff_recv]')
+                        call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
+
+                        call MPI_SENDRECV( &
+                            buff_send, buffer_count, mpi_p, dst_proc, send_tag, &
+                            buff_recv, buffer_count, mpi_p, src_proc, recv_tag, &
+                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+                        call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
+
+                    #:endcall GPU_HOST_DATA
+                    $:GPU_WAIT()
                 #:else
                     call nvtxStartRange("RHS-COMM-DEV2HOST")
-                    !$acc update host(buff_send)
+                    $:GPU_UPDATE(host='[buff_send]')
                     call nvtxEndRange
                     call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
-                #:endif
 
-                call MPI_SENDRECV( &
-                    buff_send, buffer_count, mpi_p, dst_proc, send_tag, &
-                    buff_recv, buffer_count, mpi_p, src_proc, recv_tag, &
-                    MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+                    call MPI_SENDRECV( &
+                        buff_send, buffer_count, mpi_p, dst_proc, send_tag, &
+                        buff_recv, buffer_count, mpi_p, src_proc, recv_tag, &
+                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
 
-                call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
+                    call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
 
-                #:if rdma_mpi
-                    !$acc end host_data
-                    !$acc wait
-                #:else
                     call nvtxStartRange("RHS-COMM-HOST2DEV")
-                    !$acc update device(buff_recv)
+                    $:GPU_UPDATE(device='[buff_recv]')
                     call nvtxEndRange
                 #:endif
             end if
@@ -854,7 +859,7 @@ contains
         #:for mpi_dir in [1, 2, 3]
             if (mpi_dir == ${mpi_dir}$) then
                 #:if mpi_dir == 1
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
                     do l = 0, p
                         do k = 0, n
                             do j = -buff_size, -1
@@ -874,7 +879,7 @@ contains
                     end do
 
                     if (qbmm_comm) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do l = 0, p
                             do k = 0, n
                                 do j = -buff_size, -1
@@ -889,7 +894,7 @@ contains
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do l = 0, p
                             do k = 0, n
                                 do j = -buff_size, -1
@@ -905,7 +910,7 @@ contains
                         end do
                     end if
                 #:elif mpi_dir == 2
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
                     do i = 1, nVar
                         do l = 0, p
                             do k = -buff_size, -1
@@ -926,7 +931,7 @@ contains
                     end do
 
                     if (qbmm_comm) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do i = nVar + 1, nVar + 4
                             do l = 0, p
                                 do k = -buff_size, -1
@@ -942,7 +947,7 @@ contains
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do i = nVar + 1, nVar + 4
                             do l = 0, p
                                 do k = -buff_size, -1
@@ -960,7 +965,7 @@ contains
                     end if
                 #:else
                     ! Unpacking buffer from bc_z%beg
-                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
                     do i = 1, nVar
                         do l = -buff_size, -1
                             do k = -buff_size, n + buff_size
@@ -982,7 +987,7 @@ contains
                     end do
 
                     if (qbmm_comm) then
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do i = nVar + 1, nVar + 4
                             do l = -buff_size, -1
                                 do k = -buff_size, n + buff_size
@@ -999,7 +1004,7 @@ contains
                             end do
                         end do
 
-                        !$acc parallel loop collapse(5) gang vector default(present) private(r)
+                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
                         do i = nVar + 1, nVar + 4
                             do l = -buff_size, -1
                                 do k = -buff_size, n + buff_size
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index e459c91831..4d702106f1 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -316,15 +316,14 @@ contains
             Rc_min_loc = minval(Rc_sf)
         end if
 #else
-        !$acc kernels
+        #:call GPU_PARALLEL()
         icfl_max_loc = maxval(icfl_sf)
-        !$acc end kernels
-
+        #:endcall GPU_PARALLEL
         if (viscous) then
-            !$acc kernels
+            #:call GPU_PARALLEL()
             vcfl_max_loc = maxval(vcfl_sf)
             Rc_min_loc = minval(Rc_sf)
-            !$acc end kernels
+            #:endcall GPU_PARALLEL
         end if
 #endif
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index d168acc752..092af7fbd7 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -748,7 +748,7 @@ contains
 
                             ! Momentum
                             if (mhd .and. (.not. relativity)) then
-                                !$acc loop seq
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, 3
                                     ! Flux of rho*v_i in the ${XYZ}$ direction
                                     ! = rho * v_i * v_${XYZ}$ - B_i * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
@@ -763,7 +763,7 @@ contains
                                         /(s_M - s_P)
                                 end do
                             elseif (mhd .and. relativity) then
-                                !$acc loop seq
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, 3
                                     ! Flux of m_i in the ${XYZ}$ direction
                                     ! = m_i * v_${XYZ}$ - b_i/Gamma * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
@@ -850,7 +850,7 @@ contains
                                     + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp
                             else if (hypoelasticity) then
                                 flux_tau_L = 0._wp; flux_tau_R = 0._wp
-                                !$acc loop seq
+                                $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
                                     flux_tau_L = flux_tau_L + tau_e_L(dir_idx_tau(i))*vel_L(dir_idx(i))
                                     flux_tau_R = flux_tau_R + tau_e_R(dir_idx_tau(i))*vel_R(dir_idx(i))
@@ -946,7 +946,7 @@ contains
                                 if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const.
                                     ! B_y flux = v_x * B_y - v_y * Bx0
                                     ! B_z flux = v_x * B_z - v_z * Bx0
-                                    !$acc loop seq
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 0, 1
                                         flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) &
                                                                                - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) &
@@ -956,7 +956,7 @@ contains
                                     ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$)
                                     ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$)
                                     ! B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$)
-                                    !$acc loop seq
+                                    $:GPU_LOOP(parallelism='[seq]')
                                     do i = 0, 2
                                         flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (1 - dir_flg(i + 1))*( &
                                                                                     s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - &
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 11a16c6bcb..cf8cf80bd7 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -21,7 +21,7 @@ contains
         !! @param l z coordinate index
         !! @return fltr_dtheta Modified dtheta value for cylindrical coordinates
     pure function f_compute_filtered_dtheta(k, l) result(fltr_dtheta)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
         integer, intent(in) :: k, l
         real(wp) :: fltr_dtheta
         integer :: Nfq
@@ -48,7 +48,7 @@ contains
         !! @param l z coordinate index
         !! @return cfl_terms computed CFL terms for 2D/3D cases
     pure function f_compute_multidim_cfl_terms(vel, c, j, k, l) result(cfl_terms)
-        !$acc routine seq
+        $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(num_vels), intent(in) :: vel
         real(wp), intent(in) :: c
         integer, intent(in) :: j, k, l
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 761ebdd792..8a9d692fff 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -993,9 +993,9 @@ contains
             end do
         end do
 
-        !$acc kernels
+        #:call GPU_PARALLEL()
         dt_local = minval(max_dt)
-        !$acc end kernels
+        #:endcall GPU_PARALLEL
 
         if (num_procs == 1) then
             dt = dt_local

From 2beb457c343c1e9b113e7df8a73caa737f723f3d Mon Sep 17 00:00:00 2001
From: Tanush Prathi <tanushp04@gmail.com>
Date: Wed, 2 Jul 2025 14:07:12 -0400
Subject: [PATCH 74/75] Ran formatter

---
 src/simulation/m_data_output.fpp   | 6 +++---
 src/simulation/m_time_steppers.fpp | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 4d702106f1..f233bb5374 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -317,12 +317,12 @@ contains
         end if
 #else
         #:call GPU_PARALLEL()
-        icfl_max_loc = maxval(icfl_sf)
+            icfl_max_loc = maxval(icfl_sf)
         #:endcall GPU_PARALLEL
         if (viscous) then
             #:call GPU_PARALLEL()
-            vcfl_max_loc = maxval(vcfl_sf)
-            Rc_min_loc = minval(Rc_sf)
+                vcfl_max_loc = maxval(vcfl_sf)
+                Rc_min_loc = minval(Rc_sf)
             #:endcall GPU_PARALLEL
         end if
 #endif
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 8a9d692fff..381455be2b 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -994,7 +994,7 @@ contains
         end do
 
         #:call GPU_PARALLEL()
-        dt_local = minval(max_dt)
+            dt_local = minval(max_dt)
         #:endcall GPU_PARALLEL
 
         if (num_procs == 1) then

From df54d6f1b4b21326605cc338f2ac2731af2f16e6 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 6 Jul 2025 09:52:34 -0400
Subject: [PATCH 75/75] fix bug

---
 src/simulation/m_rhs.fpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 744d54c31b..5c16a5ba9f 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -173,8 +173,8 @@ contains
 
         integer :: num_eqns_after_adv
 
-        $:GPU_ENTER_DATA(copyin='[idwbuff,idwbuff]')
-        $:GPU_UPDATE(device='[idwbuff, idwbuff]')
+        $:GPU_ENTER_DATA(copyin='[idwbuff]')
+        $:GPU_UPDATE(device='[idwbuff]')
 
         @:ALLOCATE(q_cons_qp%vf(1:sys_size))
         @:ALLOCATE(q_prim_qp%vf(1:sys_size))