
Commit b01d6fa

Merge branch 'master' into Diffusion
2 parents 100309e + eb61616

35 files changed: +1655 -1307 lines

.github/workflows/frontier/build.sh

Lines changed: 1 addition & 1 deletion
@@ -13,6 +13,6 @@ if [ "$2" == "bench" ]; then
         ./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
     done
 else
-    ./mfc.sh test -a --dry-run --rdma-mpi --generate -j 8 $build_opts
+    ./mfc.sh test -a --dry-run --rdma-mpi -j 8 $build_opts
 fi

.github/workflows/frontier/test.sh

Lines changed: 1 addition & 1 deletion
@@ -6,5 +6,5 @@ ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c`
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier
 else
-    ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j 32 -- -c frontier
+    ./mfc.sh test -a --max-attempts 3 -j 32 -- -c frontier
 fi

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ jobs:
       run: |
         brew update
         brew upgrade
-        brew install coreutils python cmake fftw hdf5 gcc@15 boost open-mpi lapack
+        brew install coreutils python fftw hdf5 gcc@15 boost open-mpi lapack
         echo "FC=gfortran-15" >> $GITHUB_ENV
         echo "BOOST_INCLUDE=/opt/homebrew/include/" >> $GITHUB_ENV

.pr_agent.toml

Lines changed: 4 additions & 4 deletions
@@ -4,12 +4,12 @@
 pr_commands = ["/describe", "/review", "/improve"]

 [pr_reviewer] # (all fields optional)
-num_max_findings = 5 # how many items to surface
-require_tests_review = true
+num_max_findings = 10 # how many items to surface
+require_tests_review = true
 extra_instructions = """
 Focus on duplicate code, the possibility of bugs, and if the PR added appropriate tests if it added a simulation feature.
 """

 [pr_code_suggestions]
-commitable_code_suggestions = false # purely advisory, no write ops
-apply_suggestions_checkbox = false # hides the “Apply/Chat” boxes
+commitable_code_suggestions = true
+apply_suggestions_checkbox = true

README.md

Lines changed: 3 additions & 3 deletions
@@ -28,7 +28,7 @@
 **Welcome!**
 MFC simulates compressible multi-phase flows, [among other things](#what-else-can-this-thing-do).
 It uses metaprogramming to stay short and portable (~20K lines).
-MFC conducted the largest known, open CFD simulation at <a href="https://arxiv.org/abs/2505.07392" target="_blank">101 trillion grid points</a> (as of July 2025).
+MFC conducted the largest known, open CFD simulation at <a href="https://arxiv.org/abs/2505.07392" target="_blank">200 trillion grid points</a>, and 1 quadrillion degrees of freedom (as of September 2025), and is a 2025 Gordon Bell Prize finalist.

 <p align="center">
   <a href="https://doi.org/10.48550/arXiv.2503.07953" target="_blank">
@@ -187,7 +187,7 @@ They are organized below.

 * GPU compatible on NVIDIA ([P/V/A/H]100, GH200, etc.) and AMD (MI[1/2/3]00+) GPU and APU hardware
 * Ideal weak scaling to 100% of the largest GPU and superchip supercomputers
-  * \>36K AMD APUs (MI300A) on [LLNL El Capitan](https://hpc.llnl.gov/hardware/compute-platforms/el-capitan)
+  * \>43K AMD APUs (MI300A) on [LLNL El Capitan](https://hpc.llnl.gov/hardware/compute-platforms/el-capitan)
   * \>3K AMD APUs (MI300A) on [LLNL Tuolumne](https://hpc.llnl.gov/hardware/compute-platforms/tuolumne)
   * \>33K AMD GPUs (MI250X) on [OLCF Frontier](https://www.olcf.ornl.gov/frontier/)
   * \>10K NVIDIA GPUs (V100) on [OLCF Summit](https://www.olcf.ornl.gov/summit/)
@@ -199,7 +199,7 @@ They are organized below.

 * [Fypp](https://fypp.readthedocs.io/en/stable/fypp.html) metaprogramming for code readability, performance, and portability
 * Continuous Integration (CI)
-  * > 500 Regression tests with each PR.
+  * \>500 Regression tests with each PR.
   * Performed with GNU (GCC), Intel (oneAPI), Cray (CCE), and NVIDIA (NVHPC) compilers on NVIDIA and AMD GPUs.
   * Line-level test coverage reports via [Codecov](https://app.codecov.io/gh/MFlowCode/MFC) and `gcov`
   * Benchmarking to avoid performance regressions and identify speed-ups

examples/scaling/FRONTIER_BENCH.md

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
# Description

The scripts and case file in this directory are set up to benchmark strong
and weak scaling performance, as well as single-device absolute performance, on
OLCF Frontier. The case file is for a three-dimensional, two-fluid liquid--gas
problem without viscosity or surface tension. The scripts contained here have
been tested for the default node counts and problem sizes in the scripts. The
reference data in `reference.dat` also makes use of the default node counts and
problem sizes and will need to be regenerated if either changes. The benchmarks
can be run with the following steps:

## Getting the code

The code is hosted on GitHub and can be cloned with the following command:

```bash
git clone git@github.com:MFlowCode/MFC.git; cd MFC; chmod u+x examples/scaling/*.sh;
```

The above command clones the repository, changes directory to the repository
root, and makes the benchmark scripts executable.

## Running the benchmarks

### Step 1: Building

The code for the benchmarks is built with the following command:
```
./examples/scaling/build.sh
```

### Step 2: Running

The benchmarks can be run in their default configuration with the following:
```
./examples/scaling/submit_all.sh --account <account_name>
```
By default this will submit the following jobs for benchmarking:

| Job | Nodes | Description |
| ------------------ | ----- | ------------------------------------------------------------------- |
| `MFC-W-16-64` | 16 | Weak scaling calculation with a ~64GB problem per GCD on 16 nodes |
| `MFC-W-128-64` | 128 | Weak scaling calculation with a ~64GB problem per GCD on 128 nodes |
| `MFC-W-1024-64` | 1024 | Weak scaling calculation with a ~64GB problem per GCD on 1024 nodes |
| `MFC-W-8192-64` | 8192 | Weak scaling calculation with a ~64GB problem per GCD on 8192 nodes |
| `MFC-S-8-4096` | 8 | Strong scaling calculation with a ~4096GB problem on 8 nodes |
| `MFC-S-64-4096` | 64 | Strong scaling calculation with a ~4096GB problem on 64 nodes |
| `MFC-S-512-4096` | 512 | Strong scaling calculation with a ~4096GB problem on 512 nodes |
| `MFC-S-4096-4096` | 4096 | Strong scaling calculation with a ~4096GB problem on 4096 nodes |
| `MFC-G-8` | 1 | Single device grind time calculation with ~8GB per GCD |
| `MFC-G-16` | 1 | Single device grind time calculation with ~16GB per GCD |
| `MFC-G-32` | 1 | Single device grind time calculation with ~32GB per GCD |
| `MFC-G-64` | 1 | Single device grind time calculation with ~64GB per GCD |

Strong and weak scaling cases run `pre_process` once and then run `simulation`
with and without GPU-aware MPI in a single job. Individual benchmarks can be run
by calling the `submit_[strong,weak,grind].sh` scripts directly, or by modifying
the `submit_all.sh` script to fit your needs.

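For instance, a single grind-time sweep could be submitted directly. The sketch below assumes `submit_grind.sh` accepts the same `--account` option as `submit_all.sh` and the `--mem` list described in the next subsection; check the script header before relying on it:

```bash
./examples/scaling/submit_grind.sh --account <account_name> --mem "8,16,32,64"
```
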
#### Modifying the benchmarks

The submitted jobs can be modified by appending options to the `submit_all.sh`
script. For example, appending
```
--nodes "1,2,4,8"
```
to the `submit_strong.sh` and `submit_weak.sh` scripts will run the strong and
weak scaling benchmarks on 1, 2, 4, and 8 nodes. Appending
```
--mem "x,y"
```
will modify the approximate problem size in terms of GB of memory
(see the `submit_[strong,weak,grind].sh` scripts for details on what this number
refers to for the different types of tests). A combined invocation is sketched
below.

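As a minimal sketch, a small weak-scaling sweep might look like the following (with the same caveat as above about whether the per-script submitters accept `--account`):

```bash
./examples/scaling/submit_weak.sh --account <account_name> --nodes "1,2,4,8" --mem "64"
```

This keeps the ~64GB-per-GCD size of the default weak-scaling jobs but runs it on smaller node counts.
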
### Step 3: Post processing

The log files can be post-processed into a more human-readable format with
```
python3 examples/scaling/analyze.py
```
This Python script prints a table of results to the command line with a
comparison to the reference data in `reference.dat`. The `rel_perf` column
compares the raw run times of the current results to the reference data.
Relative performance numbers smaller than 1.0 indicate a speedup and numbers
larger than 1.0 indicate a slowdown relative to the reference data. The selected
problem sizes are intended to be comparable to the tiny, small, medium, and large
labels used by the SPEChpc benchmark.

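If a single run needs to be checked by hand, `analyze.py` reads the last `Time Avg =` entry from each simulation log in `examples/scaling/logs/`, with log names following a `<scaling>-<nodes>-<memory>-<rdma>-<phase>.out` pattern (this naming is inferred from `analyze.py`, not separately documented). A minimal sketch, with the log name left as a placeholder:

```bash
# Print the final "Time Avg" reported in one simulation log
grep "Time Avg" examples/scaling/logs/<scaling>-<nodes>-<mem>-<rdma>-sim.out | tail -n 1
```
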
## Common errors

The only common failure point identified during testing was "text file busy"
errors, which cause job failures. These errors are intermittent and are usually
resolved by resubmitting the test.

examples/scaling/README.md

Lines changed: 3 additions & 4 deletions
@@ -1,11 +1,10 @@
-# Strong- & Weak-scaling
+# Scaling and Performance test

 The scaling case can exercise both weak- and strong-scaling. It
 adjusts itself depending on the number of requested ranks.

-This directory also contains a collection of scripts used to test strong-scaling
-on OLCF Frontier. They required modifying MFC to collect some metrics but are
-meant to serve as a reference to users wishing to run similar experiments.
+This directory also contains a collection of scripts used to test strong and weak
+scaling on OLCF Frontier.

 ## Weak Scaling

examples/scaling/analyze.py

Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
import os, re
import pandas as pd
from io import StringIO


def parse_time_avg(path):
    last_val = None
    pattern = re.compile(r"Time Avg =\s*([0-9.E+-]+)")
    with open(path) as f:
        for line in f:
            match = pattern.search(line)
            if match:
                last_val = float(match.group(1))
    return last_val


def parse_grind_time(path):
    last_val = None
    pattern = re.compile(r"Performance: \s*([0-9.E+-]+)")
    with open(path) as f:
        for line in f:
            match = pattern.search(line)
            if match:
                last_val = float(match.group(1))
    return last_val


def parse_reference_file(filename):
    with open(filename) as f:
        content = f.read()

    records = []
    blocks = re.split(r"\n(?=Weak|Strong|Grind)", content.strip())

    for block in blocks:
        lines = block.strip().splitlines()
        header = lines[0].strip()
        body = "\n".join(lines[1:])

        df = pd.read_csv(StringIO(body), delim_whitespace=True)

        if header.startswith("Weak Scaling"):
            # Parse metadata from header
            mem_match = re.search(r"Memory: ~(\d+)GB", header)
            rdma_match = re.search(r"RDMA: (\w)", header)
            memory = int(mem_match.group(1)) if mem_match else None
            rdma = rdma_match.group(1) if rdma_match else None

            for _, row in df.iterrows():
                records.append({"scaling": "weak", "nodes": int(row["nodes"]), "memory": memory, "rdma": rdma, "phase": "sim", "time_avg": row["time_avg"], "efficiency": row["efficiency"]})

        elif header.startswith("Strong Scaling"):
            mem_match = re.search(r"Memory: ~(\d+)GB", header)
            rdma_match = re.search(r"RDMA: (\w)", header)
            memory = int(mem_match.group(1)) if mem_match else None
            rdma = rdma_match.group(1) if rdma_match else None

            for _, row in df.iterrows():
                records.append(
                    {
                        "scaling": "strong",
                        "nodes": int(row["nodes"]),
                        "memory": memory,
                        "rdma": rdma,
                        "phase": "sim",
                        "time_avg": row["time_avg"],
                        "speedup": row["speedup"],
                        "efficiency": row["efficiency"],
                    }
                )

        elif header.startswith("Grind Time"):
            for _, row in df.iterrows():
                records.append({"scaling": "grind", "memory": int(row["memory"]), "grind_time": row["grind_time"]})

    return pd.DataFrame(records)


# Get log files and filter for simulation logs
files = os.listdir("examples/scaling/logs/")
files = [f for f in files if "sim" in f]

records = []
for fname in files:
    # Remove extension
    parts = fname.replace(".out", "").split("-")
    scaling, nodes, memory, rdma, phase = parts
    records.append({"scaling": scaling, "nodes": int(nodes), "memory": int(memory), "rdma": rdma, "phase": phase, "file": fname})

df = pd.DataFrame(records)

ref_data = parse_reference_file("examples/scaling/reference.dat")

print()

weak_df = df[df["scaling"] == "weak"]
strong_df = df[df["scaling"] == "strong"]
grind_df = df[df["scaling"] == "grind"]

weak_ref_df = ref_data[ref_data["scaling"] == "weak"]
strong_ref_df = ref_data[ref_data["scaling"] == "strong"]
grind_ref_df = ref_data[ref_data["scaling"] == "grind"]

weak_scaling_mem = weak_df["memory"].unique()
weak_scaling_rdma = weak_df["rdma"].unique()

for mem in weak_scaling_mem:
    for rdma in weak_scaling_rdma:
        subset = weak_df[(weak_df["memory"] == mem) & (weak_df["rdma"] == rdma)]
        subset = subset.sort_values(by="nodes")
        ref = weak_ref_df[(weak_ref_df["memory"] == mem) & (weak_ref_df["rdma"] == rdma) & (weak_ref_df["nodes"].isin(subset["nodes"]))]
        ref = ref.sort_values(by="nodes")

        times = []
        for _, row in subset.iterrows():
            time_avg = parse_time_avg(os.path.join("examples/scaling/logs", row["file"]))
            times.append(time_avg)

        subset = subset.copy()
        ref = ref.copy()
        subset["time_avg"] = times
        base_time = subset.iloc[0]["time_avg"]

        subset["efficiency"] = base_time / subset["time_avg"]
        subset["rel_perf"] = subset["time_avg"] / ref["time_avg"].values
        print(f"Weak Scaling - Memory: ~{mem}GB, RDMA: {rdma}")
        print(subset[["nodes", "time_avg", "efficiency", "rel_perf"]].to_string(index=False))
        print()

strong_scaling_mem = strong_df["memory"].unique()
strong_scaling_rdma = strong_df["rdma"].unique()

for mem in strong_scaling_mem:
    for rdma in strong_scaling_rdma:
        subset = strong_df[(strong_df["memory"] == mem) & (strong_df["rdma"] == rdma)]
        subset = subset.sort_values(by="nodes")

        ref = strong_ref_df[(strong_ref_df["memory"] == mem) & (strong_ref_df["rdma"] == rdma) & (strong_ref_df["nodes"].isin(subset["nodes"]))]
        ref = ref.sort_values(by="nodes")

        times = []
        for _, row in subset.iterrows():
            time_avg = parse_time_avg(os.path.join("examples/scaling/logs", row["file"]))
            times.append(time_avg)

        subset = subset.copy()
        ref = ref.copy()
        subset["time_avg"] = times
        base_time = subset.iloc[0]["time_avg"]

        subset["speedup"] = base_time / subset["time_avg"]
        subset["efficiency"] = base_time / ((subset["nodes"] / subset.iloc[0]["nodes"]) * subset["time_avg"])
        subset["rel_perf"] = subset["time_avg"] / ref["time_avg"].values
        print(f"Strong Scaling - Memory: ~{mem}GB, RDMA: {rdma}")
        print(subset[["nodes", "time_avg", "speedup", "efficiency", "rel_perf"]].to_string(index=False))
        print()

if not grind_df.empty:
    grind_mem = grind_df["memory"].unique()
    subset = grind_df.sort_values(by="memory")
    ref = grind_ref_df[(grind_ref_df["memory"].isin(subset["memory"]))]
    ref = ref.sort_values(by="memory")

    times = []
    for _, row in subset.iterrows():
        grind_time = parse_grind_time(os.path.join("examples/scaling/logs", row["file"]))
        times.append(grind_time)

    subset = subset.copy()
    ref = ref.copy()

    subset["grind_time"] = times
    subset["rel_perf"] = subset["grind_time"] / ref["grind_time"].values
    print(f"Grind Time - Single Device")
    print(subset[["memory", "grind_time", "rel_perf"]].to_string(index=False))

    print()

examples/scaling/build.sh

File mode changed: 100644 → 100755
Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,6 @@
 #!/bin/bash

+. ./mfc.sh load -c f -m g
+
 ./mfc.sh build -t pre_process simulation --case-optimization -i examples/scaling/case.py \
-    -j 8 --gpu --mpi --no-debug -- -s strong -m 512
+    -j 8 --gpu --mpi --no-debug -- -s strong -m 512
