
Commit 2358d29

Add scripts for santis/alps, example case, and captures for UVM comms via RDMA
1 parent a1d1576 commit 2358d29

8 files changed: +252 −3 lines changed

CMakeLists.txt

Lines changed: 3 additions & 3 deletions
@@ -486,17 +486,17 @@ function(MFC_SETUP_TARGET)
     endforeach()

     target_compile_options(${a_target}
-        PRIVATE -gpu=keep,ptxinfo,lineinfo
+        PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
     )

     # GH-200 Unified Memory Support
     if (MFC_Unified)
         target_compile_options(${ARGS_TARGET}
-            PRIVATE -gpu=unified
+            PRIVATE -gpu=mem:unified -cuda
         )
         # "This option must appear in both the compile and link lines" -- NVHPC Docs
         target_link_options(${ARGS_TARGET}
-            PRIVATE -gpu=unified
+            PRIVATE -gpu=mem:unified -cuda
         )
     endif()
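Two independent changes land in this hunk: fastmath joins the GPU compile options, and the unified-memory build moves from the older -gpu=unified spelling to -gpu=mem:unified -cuda, the newer NVHPC form for GH200. Building with -gpu=mem:unified should also predefine the __NVCOMPILER_GPU_UNIFIED_MEM macro that the m_mpi_common.fpp hunk below keys on; per the NVHPC docs quote kept in the file, the flag must appear on both the compile and link lines.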

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
import math
import json

N = 799
Nx = N
Ny = 2*(N+1)-1
Nz = 2*(N+1)-1

Re = 1600
L = 1
P0 = 101325
rho0 = 1
C0 = math.sqrt(1.4 * P0)
V0 = 0.1 * C0
mu = V0 * L / Re

cfl = 0.5
dx = 2 * math.pi * L / (Ny + 1)

dt = cfl * dx / (C0)

tC = L / V0
tEnd = 20 * tC

Nt = int(tEnd / dt)
Nt = 10


# Configuring case dictionary
print(
    json.dumps(
        {
            "rdma_mpi": "T",
            # Logistics
            "run_time_info": "F",
            # Computational Domain Parameters
            "x_domain%beg": -math.pi * L,
            "x_domain%end": math.pi * L,
            "y_domain%beg": -math.pi * L,
            "y_domain%end": math.pi * L,
            "z_domain%beg": -math.pi * L,
            "z_domain%end": math.pi * L,
            "m": Nx,
            "n": Ny,
            "p": Nz,
            "cyl_coord": "F",
            "dt": dt,
            "t_step_start": 0,
            "t_step_stop": Nt,
            "t_step_save": int(Nt / 100),
            # Simulation Algorithm Parameters
            "num_patches": 1,
            "model_eqns": 2,
            "num_fluids": 1,
            "time_stepper": 3,
            "bc_x%beg": -1,
            "bc_x%end": -1,
            "bc_y%beg": -1,
            "bc_y%end": -1,
            "bc_z%beg": -1,
            "bc_z%end": -1,
            "igr": "T",
            "igr_order": 5,
            "igr_iter_solver": 1,
            "num_igr_iters": 3,
            "num_igr_warm_start_iters": 3,
            "alf_factor": 10,
            "viscous": "T",
            # Formatted Database Files Structure Parameters
            "format": 1,
            "precision": 2,
            "prim_vars_wrt": "T",
            "omega_wrt(1)": "T",
            "omega_wrt(2)": "T",
            "omega_wrt(3)": "T",
            "qm_wrt": "T",
            "fd_order": 4,
            "parallel_io": "T",
            # Patch 1: Background (AIR - 2)
            "patch_icpp(1)%geometry": 9,
            "patch_icpp(1)%x_centroid": 0,
            "patch_icpp(1)%y_centroid": 0,
            "patch_icpp(1)%z_centroid": 0,
            "patch_icpp(1)%length_x": 2 * math.pi * L,
            "patch_icpp(1)%length_y": 2 * math.pi * L,
            "patch_icpp(1)%length_z": 2 * math.pi * L,
            "patch_icpp(1)%vel(1)": 0.0,
            "patch_icpp(1)%vel(2)": 0.0,
            "patch_icpp(1)%vel(3)": 0,
            "patch_icpp(1)%pres": 0.0,
            "patch_icpp(1)%hcid": 380,
            "patch_icpp(1)%alpha_rho(1)": 1,
            "patch_icpp(1)%alpha(1)": 1,
            # Fluids Physical Parameters
            "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
            "fluid_pp(1)%pi_inf": 0,
            "fluid_pp(1)%Re(1)": 1 / mu,
        }
    )
)
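As a quick sanity check of the derived quantities above (a standalone sketch, not part of the commit; values are approximate):

import math

P0, rho0 = 101325, 1               # ambient pressure [Pa] and density
C0 = math.sqrt(1.4 * P0 / rho0)    # sound speed; rho0 = 1, so same as the file's sqrt(1.4 * P0)  -> ~376.6
V0 = 0.1 * C0                      # Mach-0.1 reference velocity                                 -> ~37.7
dx = 2 * math.pi * 1 / (1599 + 1)  # 2*pi*L over Ny + 1 = 2*(N+1) = 1600 cells                   -> ~3.93e-3
dt = 0.5 * dx / C0                 # acoustic CFL of 0.5                                         -> ~5.2e-6
print(C0, V0, dx, dt)

Note that Nt is computed from tEnd / dt and then immediately overridden to 10, so the committed case runs only ten steps (and t_step_save = int(Nt / 100) evaluates to 0), presumably a short smoke test for the UVM-over-RDMA path.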

misc/nvidia_uvm/bind.sh

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

# -------------------------------- #
# Binding for a single Santis node #
# -------------------------------- #

# Local rank
export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"

# Bind to GPU
export CUDA_VISIBLE_DEVICES="$local_rank"

# Bind to NIC
export MPICH_OFI_NIC_POLICY=USER
export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"

# Bind to cores (first core per socket)
physcores=(0 72 144 216)

#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY

#set -x
numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"
#set +x
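The script assumes a quad-GH200 Santis node: four Grace sockets of 72 cores each, with a matching GPU and NIC per socket. A minimal sketch of the mapping it establishes (illustrative Python, not part of the commit):

# Per-local-rank binding as set up by bind.sh on one quad-GH200 node.
PHYSCORES = (0, 72, 144, 216)  # first core of each 72-core Grace socket

def binding(local_rank: int) -> dict:
    return {
        "CUDA_VISIBLE_DEVICES": str(local_rank),  # one GPU per rank
        "nic": local_rank,                        # from MPICH_OFI_NIC_MAPPING "0:0; 1:1; 2:2; 3:3"
        "core": PHYSCORES[local_rank],            # passed to numactl --physcpubind
    }

for rank in range(4):
    print(rank, binding(rank))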

misc/nvidia_uvm/nsys.sh

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
#!/bin/bash

#set -x
set -euo pipefail

rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"

[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep
[[ -z "${NSYS+x}" ]] && NSYS=0

if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
    exec nsys profile \
        --cpuctxsw=none -b none -s none \
        --event-sample=system-wide \
        --cpu-socket-events=61,71,265,273 \
        --cpu-socket-metrics=103,104 \
        --event-sampling-interval=10 \
        --trace=nvtx,openacc \
        --force-overwrite=true \
        -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
        -o "$NSYS_FILE" "$@"
else
    exec "$@"
fi
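Only global rank 0 runs under the profiler; every other rank exec's the binary directly, so a job yields a single report. The same gating pattern, sketched in Python for clarity (not part of the commit):

import os, sys

# Gate profiling on NSYS=1 and global rank 0, as nsys.sh does.
rank = int(os.environ.get("OMPI_COMM_WORLD_RANK") or os.environ.get("SLURM_PROCID") or 0)
cmd = sys.argv[1:]
if os.environ.get("NSYS", "0") != "0" and rank == 0:
    report = os.environ.get("NSYS_FILE", "report.qdrep")
    cmd = ["nsys", "profile", "--force-overwrite=true", "-o", report] + cmd
os.execvp(cmd[0], cmd)  # replace this process, mirroring bash `exec`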

src/common/m_mpi_common.fpp

Lines changed: 8 additions & 0 deletions
@@ -38,7 +38,9 @@ module m_mpi_common
     !! average primitive variables, for a single computational domain boundary
     !! at the time, from the relevant neighboring processor.

+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
     $:GPU_DECLARE(create='[buff_send, buff_recv]')
+#endif

     integer :: halo_size
     $:GPU_DECLARE(create='[halo_size]')

@@ -78,7 +80,13 @@ contains

     $:GPU_UPDATE(device='[halo_size, v_size]')

+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
     @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
+#else
+    ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
+    !$acc enter data create(capture:buff_send)
+    !$acc enter data create(capture:buff_recv)
+#endif
 #endif

 end subroutine s_initialize_mpi_common_module
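With a unified-memory build (__NVCOMPILER_GPU_UNIFIED_MEM defined by -gpu=mem:unified), the halo exchange buffers are no longer declared or allocated through the GPU macros: they become plain Fortran allocations, and the capture modifier on the enter data directives appears intended to give them a distinct device-resident copy, matching the "captures for UVM comms via RDMA" in the commit message.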

toolchain/mfc/build.py

Lines changed: 3 additions & 0 deletions
@@ -64,6 +64,9 @@ def get_install_dirpath(self, case: Case ) -> str:
         # The install directory is located <root>/build/install/<slug>
         return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)])

+    def get_home_dirpath(self, case: Case) -> str:
+        return os.sep.join([os.getcwd()])
+
     def get_install_binpath(self, case: Case ) -> str:
         # <root>/install/<slug>/bin/<target>
         return os.sep.join([self.get_install_dirpath(case), "bin", self.name])
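The new get_home_dirpath simply returns the current working directory (the MFC root when invoked through mfc.sh); the santis.mako template below uses it to locate misc/nvidia_uvm/bind.sh and nsys.sh in the source tree rather than the install tree.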

toolchain/modules

Lines changed: 3 additions & 0 deletions
@@ -85,3 +85,6 @@ n-cpu penguin/openmpi/4.1.5/gcc-8.5.0
 n-gpu penguin/openmpi/4.1.5/nvhpc-22.3 nvidia/nvhpc/22.3 cuda/cuda-11.6
 n-gpu CC=nvc CXX=nvc++ FC=nvfortran

+san     CSCS Santis
+san-all cmake python
+san-gpu nvhpc cuda cray-mpich

toolchain/templates/santis.mako

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
#!/usr/bin/env bash

<%namespace name="helpers" file="helpers.mako"/>

% if engine == 'batch':
#SBATCH --uenv=icon/25.2:v1
#SBATCH --nodes=${nodes}
#SBATCH --reservation=g183
#SBATCH --ntasks-per-node=${tasks_per_node}
#SBATCH --job-name="${name}"
#SBATCH --output="${name}.out"
#SBATCH --error="${name}.err"
#SBATCH --time=${walltime}
% if account:
#SBATCH --account=${account}
% endif
% if partition:
#SBATCH --partition=${partition}
% endif
% if quality_of_service:
#SBATCH --qos=${quality_of_service}
% endif
% if email:
#SBATCH --mail-user=${email}
#SBATCH --mail-type="BEGIN, END, FAIL"
% endif
% endif

# NVHPC and CUDA env vars
export NV_ACC_USE_MALLOC=0           # use cudaMallocManaged instead of malloc (compiled using -gpu=mem:unified)
export NVCOMPILER_ACC_NO_MEMHINTS=1  # disable implicit compiler hints
#export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH

# Cray MPICH
export MPICH_GPU_SUPPORT_ENABLED=1
export FI_CXI_RX_MATCH_MODE=software
export FI_MR_CACHE_MONITOR=disabled
export MPICH_NO_BUFFER_ALIAS_CHECK=1

# Custom env vars for MFC
export NVIDIA_ALLOC_MODE=0         # do nothing
export NVIDIA_MANUAL_GPU_HINTS=1   # prefloc GPU on some
export NVIDIA_IGR_TEMPS_ON_GPU=3   # jac, jac_rhs, and jac_old on GPU
export NVIDIA_VARS_ON_GPU=7        # q_cons_ts(1)%vf%sf for j=1-7 on GPU

# NSYS
export NSYS=1 # enable nsys profiling
export NSYS_FILE=myreport.qdrep

${helpers.template_prologue()}

ok ":) Loading modules:\n"
cd "${MFC_ROOT_DIR}"
% if engine == 'batch':
. ./mfc.sh load -c san -m ${'g' if gpu else 'c'}
% endif
cd - > /dev/null
echo

% for target in targets:
${helpers.run_prologue(target)}

% if not mpi:
(set -x; ${profiler} "${target.get_install_binpath(case)}")
% else:
(set -x; srun --unbuffered \
    --ntasks=${nodes*tasks_per_node} \
    --cpus-per-task 1 \
    --cpu-bind=none \
% if gpu:
    --gpus-per-task 1 \
% endif
    --wait 200 --bcast=/tmp/${target.name} \
    "${target.get_home_dirpath(case)}/misc/nvidia_uvm/bind.sh" \
% if target.name == 'simulation':
    "${target.get_home_dirpath(case)}/misc/nvidia_uvm/nsys.sh" \
% endif
    "${target.get_install_binpath(case)}")
% endif

${helpers.run_epilogue(target)}

echo
% endfor

${helpers.template_epilogue()}
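For MPI runs, the rendered command chains srun, then bind.sh for per-rank GPU/NIC/core pinning, then (for the simulation target only) the nsys.sh profiler wrapper, and finally the installed binary, so every rank is pinned before the wrapper decides whether to trace it. Note that NSYS=1 is exported unconditionally here, so profiling is on by default for simulation runs under this template.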
