Test Suite Flag --rdma-mpi Implemented (#598) (#878)

Malmahrouqi3 · sbryngelson · Mohammed Said Hamed Humaid Al-Mahrouqi · web-flow · commit 37cdc6ae0e3b · 2025-07-29T19:39:08.000-04:00
Co-authored-by: mohdsaid497566 <145478595+mohdsaid497566@users.noreply.github.com>
Co-authored-by: Spencer Bryngelson <sbryngelson@gmail.com>
Co-authored-by: Mohammed Said Hamed Humaid Al-Mahrouqi <malmahrouqi3@atl1-1-02-002-27-1.pace.gatech.edu>
Co-authored-by: mohdsaid497566 <mohdsaid497566@gmail.com>
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
@@ -13,5 +13,6 @@ if [ "$2" == "bench" ]; then
         ./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
     done
 else
-    ./mfc.sh test --dry-run -j 8 $build_opts
+    ./mfc.sh test -a --dry-run --rdma-mpi --generate -j 8 $build_opts
 fi
+
diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh
@@ -4,7 +4,7 @@ gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n'
 ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c`
 
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh test --max-attempts 3 -j $ngpus -- -c frontier
+    ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier
 else
-    ./mfc.sh test --max-attempts 3 -j 32 -- -c frontier
+    ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j 32 -- -c frontier
 fi
diff --git a/docs/documentation/testing.md b/docs/documentation/testing.md
@@ -16,6 +16,7 @@ A test is considered passing when our error tolerances are met in order to maint
 - `--percent` (`%`) to specify a percentage of the test suite to select at random and test
 - `--max-attempts` (`-m`) the maximum number of attempts to make on a test before considering it failed
 - `--no-examples` skips the testing of cases in the examples folder
+- `--rdma-mpi` runs additional tests where RDMA MPI is enabled
 
 To specify a computer, pass the `-c` flag to `./mfc.sh run` like so:
 ```shell
diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py
@@ -83,14 +83,15 @@ def add_common_arguments(p, mask = None):
     test.add_argument("-l", "--list",         action="store_true", help="List all available tests.")
     test.add_argument("-f", "--from",         default=test_cases[0].get_uuid(), type=str, help="First test UUID to run.")
     test.add_argument("-t", "--to",           default=test_cases[-1].get_uuid(), type=str, help="Last test UUID to run.")
-    test.add_argument("-o", "--only",         nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.")
-    test.add_argument("-a", "--test-all",     action="store_true", default=False, help="Run the Post Process Tests too.")
-    test.add_argument("-%", "--percent",      type=int, default=100, help="Percentage of tests to run.")
-    test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.")
-    test.add_argument(      "--no-build",     action="store_true",                    default=False,      help="(Testing) Do not rebuild MFC.")
-    test.add_argument(      "--no-examples",  action="store_true",                    default=False,      help="Do not test example cases." )
-    test.add_argument("--case-optimization",  action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
-    test.add_argument(      "--dry-run",      action="store_true",                    default=False,      help="Build and generate case files but do not run tests.")
+    test.add_argument("-o", "--only",         nargs="+", type=str,     default=[], metavar="L", help="Only run tests with specified properties.")
+    test.add_argument("-a", "--test-all",     action="store_true",     default=False,     help="Run the Post Process Tests too.")
+    test.add_argument("-%", "--percent",      type=int,                default=100,       help="Percentage of tests to run.")
+    test.add_argument("-m", "--max-attempts", type=int,                default=1,         help="Maximum number of attempts to run a test.")
+    test.add_argument(      "--rdma-mpi",     action="store_true",     default=False,     help="Run tests with RDMA MPI enabled")
+    test.add_argument(      "--no-build",     action="store_true",     default=False,     help="(Testing) Do not rebuild MFC.")
+    test.add_argument(      "--no-examples",  action="store_true",     default=False,     help="Do not test example cases." )
+    test.add_argument("--case-optimization",  action="store_true",     default=False,     help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
+    test.add_argument(      "--dry-run",      action="store_true",     default=False,     help="Build and generate case files but do not run tests.")
 
     test_meg = test.add_mutually_exclusive_group()
     test_meg.add_argument("--generate",          action="store_true", default=False, help="(Test Generation) Generate golden files.")
diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py
@@ -132,7 +132,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces
         filepath          = f'{self.get_dirpath()}/case.py'
         tasks             = ["-n", str(self.ppn)]
         jobs              = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else []
-        case_optimization = ["--case-optimization"] if ARG("case_optimization") else []
+        case_optimization = ["--case-optimization"]  if ARG("case_optimization") else []
 
         if self.params.get("bubbles_lagrange", 'F') == 'T':
             input_bubbles_lagrange(self)
diff --git a/toolchain/mfc/test/cases.py b/toolchain/mfc/test/cases.py
@@ -346,9 +346,10 @@ def alter_3d():
     def alter_ppn(dimInfo):
         if len(dimInfo[0]) == 3:
             cases.append(define_case_d(stack, '2 MPI Ranks', {'m': 29, 'n': 29, 'p': 49}, ppn=2))
+            cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'m': 29, 'n': 29, 'p': 49, 'rdma_mpi': 'T'}, ppn=2))
         else:
             cases.append(define_case_d(stack, '2 MPI Ranks', {}, ppn=2))
-
+            cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'rdma_mpi': 'T'}, ppn=2))
 
     def alter_ib(dimInfo, six_eqn_model=False):
         for slip in [True, False]:
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
@@ -58,7 +58,12 @@ def __filter(cases_) -> typing.List[TestCase]:
         if case.ppn > 1 and not ARG("mpi"):
             cases.remove(case)
             skipped_cases.append(case)
-    
+
+    for case in cases[:]:
+        if "RDMA MPI" in case.trace:
+            cases.remove(case)
+            skipped_cases.append(case)
+
     for case in cases[:]:
         if ARG("single"):
             skip = ['low_Mach', 'Hypoelasticity', 'teno', 'Chemistry', 'Phase Change model 6'
@@ -191,6 +196,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
         return
 
     cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
+
     out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")
 
     common.file_write(out_filepath, cmd.stdout)