diff --git a/examples/scaling/FRONTIER_BENCH.md b/examples/scaling/FRONTIER_BENCH.md
new file mode 100644
index 0000000000..b2792ddaa5
--- /dev/null
+++ b/examples/scaling/FRONTIER_BENCH.md
@@ -0,0 +1,92 @@
+# Description
+
+The scripts and case file in this directory are set up to benchmark strong
+and weak scaling performance as well as single device absolute performance on
+OLCF Frontier. The case file is for a three dimensional, two fluid liquid--gas
+problem without viscosity or surface tension. The scripts contained here have
+been tested for the default node counts and problem sizes in the scripts. The
+reference data in `reference.dat` also makes use of the default node counts and
+problem sizes and will need to be regenerated if either changes. The benchmarks
+can be run with the following steps:
+
+## Getting the code
+
+The code is hosted on GitHub and can be cloned with the following command:
+
+```bash
+git clone git@github.com:MFlowCode/MFC.git; cd MFC; chmod u+x examples/scaling/*.sh;
+```
+
+The above command clones the repository, changes directory to the repository
+root, and makes the benchmark scripts executable.
+
+## Running the benchmarks
+
+### Step 1: Building
+
+The code for the benchmarks is built with the following command
+```
+./examples/scaling/build.sh
+```
+
+### Step 2: Running
+
+The benchmarks can be run in their default configuration with the following
+```
+./examples/scaling/submit_all.sh --account <account_name>
+```
+By default this will submit the following jobs for benchmarking
+
+| Job                | Nodes | Description                                                         |
+| ------------------ | ----- | ------------------------------------------------------------------- |
+| `MFC-W-16-64`      | 16    | Weak scaling calculation with a ~64GB problem per GCD on 16 nodes   |
+| `MFC-W-128-64`     | 128   | Weak scaling calculation with a ~64GB problem per GCD on 128 nodes  |
+| `MFC-W-1024-64`    | 1024  | Weak scaling calculation with a ~64GB problem per GCD on 1024 nodes |
+| `MFC-W-8192-64`    | 8192  | Weak scaling calculation with a ~64GB problem per GCD on 8192 nodes |
+| `MFC-S-8-4096`     | 8     | Strong scaling calculation with a ~4096GB problem on 8 nodes        |
+| `MFC-S-64-4096`    | 64    | Strong scaling calculation with a ~4096GB problem on 64 nodes       |
+| `MFC-S-512-4096`   | 512   | Strong scaling calculation with a ~4096GB problem on 512 nodes      |
+| `MFC-S-4096-4096`  | 4096  | Strong scaling calculation with a ~4096GB problem on 4096 nodes     |
+| `MFC-G-8`          | 1     | Single device grind time calculation with ~8GB per GCD              |
+| `MFC-G-16`         | 1     | Single device grind time calculation with ~16GB per GCD             |
+| `MFC-G-32`         | 1     | Single device grind time calculation with ~32GB per GCD             |
+| `MFC-G-64`         | 1     | Single device grind time calculation with ~64GB per GCD             |
+Strong and weak scaling cases run `pre_process` once and then run `simulation`
+with and without GPU-aware MPI in a single job. Individual benchmarks can be run
+by calling the `submit_[strong,weak,grind].sh` scripts directly, or modifying
+the `submit_all.sh` script to fit your needs.
+
+#### Modifying the benchmarks
+The submitted jobs can be modified by appending options to the script calls in
+`submit_all.sh`. For example, appending
+```
+--nodes "1,2,4,8"
+```
+to the `submit_strong.sh` and `submit_weak.sh` scripts will run the strong and
+weak scaling benchmarks on 1, 2, 4, and 8 nodes. Appending
+```
+--mem "x,y"
+```
+will modify the approximate problem size in terms of GB of memory
+(see the `submit_[strong,weak,grind].sh` for details on what this number refers
+to for the different types of tests).
+
+### Step 3: Post processing
+
+The log files can be post processed into a more human readable format with
+```
+python3 examples/scaling/analyze.py
+```
+This Python script generates a table of results in the command line with
+comparison to the reference data in `reference.dat`. The `rel_perf` column
+compares the raw run times of the current results to the reference data.
+Relative performance numbers smaller than 1.0 indicate a speedup and numbers larger
+than 1.0 indicate a slowdown relative to the reference data. The selected problem
+sizes are intended to be comparable to the tiny, small, medium, and large labels
+used by the SpecHPC benchmark.
+
+## Common errors
+
+The only common failure identified during testing was the "text file busy"
+error, which causes job failures. These errors are intermittent and are usually
+resolved by resubmitting the test.
diff --git a/examples/scaling/README.md b/examples/scaling/README.md
index 9a81af5aab..fab674a690 100644
--- a/examples/scaling/README.md
+++ b/examples/scaling/README.md
@@ -1,11 +1,10 @@
-# Strong- & Weak-scaling
+# Scaling and Performance test
 
 The scaling case can exercise both weak- and strong-scaling. It
 adjusts itself depending on the number of requested ranks.
 
-This directory also contains a collection of scripts used to test strong-scaling
-on OLCF Frontier. They required modifying MFC to collect some metrics but are
-meant to serve as a reference to users wishing to run similar experiments.
+This directory also contains a collection of scripts used to test strong and weak
+scaling on OLCF Frontier.
 
 ## Weak Scaling
 
diff --git a/examples/scaling/analyze.py b/examples/scaling/analyze.py
new file mode 100644
index 0000000000..ab6f8067bd
--- /dev/null
+++ b/examples/scaling/analyze.py
@@ -0,0 +1,177 @@
+import os, re
+import pandas as pd
+from io import StringIO
+
+
+def parse_time_avg(path):
+    # Scan the log at `path` and return the value of its final
+    # "Time Avg = <number>" entry as a float (None when there is none).
+    time_avg_re = re.compile(r"Time Avg =\s*([0-9.E+-]+)")
+    with open(path) as log:
+        hits = (time_avg_re.search(entry) for entry in log)
+        values = [float(hit.group(1)) for hit in hits if hit]
+    # Only the last occurrence is reported.
+    return values[-1] if values else None
+
+
+def parse_grind_time(path):
+    # Return the number following the last "Performance:" entry in the log
+    # at `path` as a float, or None if the log has no such entry.
+    grind_re = re.compile(r"Performance: \s*([0-9.E+-]+)")
+    result = None
+    with open(path) as log:
+        for found in filter(None, map(grind_re.search, log)):
+            result = float(found.group(1))
+    return result
+
+
+def parse_reference_file(filename):
+    """Parse a reference data file into a single DataFrame.
+
+    The file content is split into blocks at every line that starts with
+    "Weak", "Strong", or "Grind". The first line of a block is its header;
+    the remaining lines are a whitespace-delimited table with a header row.
+
+    Returns a DataFrame with one row per table row. The "scaling" column is
+    "weak", "strong", or "grind"; blocks with any other header add no rows.
+    """
+    with open(filename) as f:
+        content = f.read()
+
+    records = []
+    # The lookahead split keeps each header line attached to its own block.
+    blocks = re.split(r"\n(?=Weak|Strong|Grind)", content.strip())
+
+    for block in blocks:
+        lines = block.strip().splitlines()
+        header = lines[0].strip()
+        body = "\n".join(lines[1:])
+
+        # sep=r"\s+" is the supported spelling of the deprecated
+        # delim_whitespace=True (pandas >= 2.2 warns on the latter).
+        df = pd.read_csv(StringIO(body), sep=r"\s+")
+
+        is_strong = header.startswith("Strong Scaling")
+        if is_strong or header.startswith("Weak Scaling"):
+            # Memory size and RDMA flag are encoded in the header line.
+            mem_match = re.search(r"Memory: ~(\d+)GB", header)
+            rdma_match = re.search(r"RDMA: (\w)", header)
+            memory = int(mem_match.group(1)) if mem_match else None
+            rdma = rdma_match.group(1) if rdma_match else None
+
+            for _, row in df.iterrows():
+                record = {"scaling": "strong" if is_strong else "weak", "nodes": int(row["nodes"]), "memory": memory, "rdma": rdma, "phase": "sim"}
+                record["time_avg"] = row["time_avg"]
+                if is_strong:
+                    # Only strong-scaling tables carry a speedup column.
+                    record["speedup"] = row["speedup"]
+                record["efficiency"] = row["efficiency"]
+                records.append(record)
+
+        elif header.startswith("Grind Time"):
+            # Grind tables have no header metadata; memory is a table column.
+            for _, row in df.iterrows():
+                records.append({"scaling": "grind", "memory": int(row["memory"]), "grind_time": row["grind_time"]})
+
+    return pd.DataFrame(records)
+
+
+# Get log files and keep only simulation logs, which are named
+# <scaling>-<nodes>-<memory>-<rdma>-sim.out
+files = [f for f in os.listdir("examples/scaling/logs/") if f.endswith("-sim.out")]
+
+columns = ["scaling", "nodes", "memory", "rdma", "phase", "file"]
+records = []
+for fname in files:
+    parts = fname.replace(".out", "").split("-")
+    if len(parts) != len(columns) - 1:
+        continue  # not a benchmark log name; skip it instead of failing to unpack
+    scaling, nodes, memory, rdma, phase = parts
+    records.append({"scaling": scaling, "nodes": int(nodes), "memory": int(memory), "rdma": rdma, "phase": phase, "file": fname})
+
+# Explicit columns keep the filters below valid even when no logs are found.
+df = pd.DataFrame(records, columns=columns)
+ref_data = parse_reference_file("examples/scaling/reference.dat")
+print()
+
+weak_df = df[df["scaling"] == "weak"]
+strong_df = df[df["scaling"] == "strong"]
+grind_df = df[df["scaling"] == "grind"]
+weak_ref_df = ref_data[ref_data["scaling"] == "weak"]
+strong_ref_df = ref_data[ref_data["scaling"] == "strong"]
+grind_ref_df = ref_data[ref_data["scaling"] == "grind"]
+
+weak_scaling_mem = weak_df["memory"].unique()
+weak_scaling_rdma = weak_df["rdma"].unique()
+
+# Weak scaling: print one table per (memory, RDMA) combination found in the logs.
+for mem in weak_scaling_mem:
+    for rdma in weak_scaling_rdma:
+        subset = weak_df[(weak_df["memory"] == mem) & (weak_df["rdma"] == rdma)]
+        if subset.empty:  # this combination was not run
+            continue
+        subset = subset.sort_values(by="nodes").copy()
+        ref = weak_ref_df[(weak_ref_df["memory"] == mem) & (weak_ref_df["rdma"] == rdma)]
+
+        log_paths = [os.path.join("examples/scaling/logs", fname) for fname in subset["file"]]
+        subset["time_avg"] = [parse_time_avg(path) for path in log_paths]
+        base_time = subset.iloc[0]["time_avg"]
+
+        # Efficiency is measured against the smallest node count in the table.
+        subset["efficiency"] = base_time / subset["time_avg"]
+        # Look up the reference time for each node count. Node counts missing
+        # from the reference data yield NaN rather than a length-mismatch error.
+        ref_time = subset["nodes"].map(ref.set_index(ref["nodes"].astype(int))["time_avg"])
+        subset["rel_perf"] = subset["time_avg"] / ref_time
+        print(f"Weak Scaling - Memory: ~{mem}GB, RDMA: {rdma}")
+        print(subset[["nodes", "time_avg", "efficiency", "rel_perf"]].to_string(index=False))
+        print()
+
+strong_scaling_mem = strong_df["memory"].unique()
+strong_scaling_rdma = strong_df["rdma"].unique()
+
+# Strong scaling: print one table per (memory, RDMA) combination found in the logs.
+for mem in strong_scaling_mem:
+    for rdma in strong_scaling_rdma:
+        subset = strong_df[(strong_df["memory"] == mem) & (strong_df["rdma"] == rdma)]
+        if subset.empty:  # this combination was not run
+            continue
+        subset = subset.sort_values(by="nodes").copy()
+        ref = strong_ref_df[(strong_ref_df["memory"] == mem) & (strong_ref_df["rdma"] == rdma)]
+
+        log_paths = [os.path.join("examples/scaling/logs", fname) for fname in subset["file"]]
+        subset["time_avg"] = [parse_time_avg(path) for path in log_paths]
+        base_time = subset.iloc[0]["time_avg"]
+
+        # Speedup is measured against the smallest node count in the table;
+        # efficiency divides that speedup by the increase in node count.
+        subset["speedup"] = base_time / subset["time_avg"]
+        subset["efficiency"] = base_time / ((subset["nodes"] / subset.iloc[0]["nodes"]) * subset["time_avg"])
+        # Look up the reference time for each node count. Node counts missing
+        # from the reference data yield NaN rather than a length-mismatch error.
+        ref_time = subset["nodes"].map(ref.set_index(ref["nodes"].astype(int))["time_avg"])
+        subset["rel_perf"] = subset["time_avg"] / ref_time
+        print(f"Strong Scaling - Memory: ~{mem}GB, RDMA: {rdma}")
+        print(subset[["nodes", "time_avg", "speedup", "efficiency", "rel_perf"]].to_string(index=False))
+        print()
+
+if not grind_df.empty:
+    # Single-device runs: one row per memory size, ordered by memory.
+    subset = grind_df.sort_values(by="memory").copy()
+
+    times = []
+    for _, row in subset.iterrows():
+        grind_time = parse_grind_time(os.path.join("examples/scaling/logs", row["file"]))
+        times.append(grind_time)
+    subset["grind_time"] = times
+
+    # Look up the reference value for each memory size. Sizes missing from
+    # the reference data yield NaN rather than a length-mismatch error.
+    ref_by_mem = grind_ref_df.set_index(grind_ref_df["memory"].astype(int))["grind_time"]
+    ref_time = subset["memory"].map(ref_by_mem)
+
+    subset["rel_perf"] = subset["grind_time"] / ref_time
+    print("Grind Time - Single Device")
+    print(subset[["memory", "grind_time", "rel_perf"]].to_string(index=False))
+
+print()
diff --git a/examples/scaling/build.sh b/examples/scaling/build.sh
old mode 100644
new mode 100755
index 0d7dde559d..d6efa4b67b
--- a/examples/scaling/build.sh
+++ b/examples/scaling/build.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
 
+. ./mfc.sh load -c f -m g
+
 ./mfc.sh build -t pre_process simulation --case-optimization -i examples/scaling/case.py \
-               -j 8 --gpu --mpi --no-debug -- -s strong -m 512
\ No newline at end of file
+               -j 8 --gpu --mpi --no-debug -- -s strong -m 512
diff --git a/examples/scaling/case.py b/examples/scaling/case.py
index 855af434c2..4276b53f38 100644
--- a/examples/scaling/case.py
+++ b/examples/scaling/case.py
@@ -2,11 +2,10 @@
 import sys, json, math, typing, argparse
 
 parser = argparse.ArgumentParser(
-    prog="scaling",
-    description="Weak- and strong-scaling benchmark case.",
+    prog="scaling_and_perf",
+    description="Weak- and strong-scaling and performance benchmark case.",
     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 )
-
 parser.add_argument(
     "--mfc",
     type=json.loads,
@@ -29,17 +28,9 @@
     metavar="MEMORY",
     help="Weak scaling: memory per rank in GB. Strong scaling: global memory in GB. Used to determine cell count.",
 )
-parser.add_argument(
-    "-f",
-    "--fidelity",
-    type=str,
-    metavar="FIDELITY",
-    choices=["ideal", "exact"],
-    default="ideal",
-)
-parser.add_argument("--rdma_mpi", type=str, metavar="FIDELITY", choices=["T", "F"], default="F")
-parser.add_argument("--n-steps", type=int, metavar="N", default=None)
-
+parser.add_argument("--rdma_mpi", metavar="RDMA", type=str, choices=["T", "F"], default="F", help="Enable RDMA-aware MPI optimizations.")
+parser.add_argument("--n-steps", metavar="N", type=int, default=20, help="Number of time steps to simulate.")
+parser.add_argument("--n-save", metavar="NS", type=int, default=20, help="Number of time steps between saves.")
 args = parser.parse_args()
 
 if args.scaling is None:
@@ -48,22 +39,60 @@
 
 # approx The number of cells per GB of memory. The exact value is not important.
 cpg = 8000000 / 16.0
+
 # Number of ranks.
 nranks = args.mfc["nodes"] * args.mfc["tasks_per_node"]
 
 
-def nxyz_from_ncells(ncells: float) -> typing.Tuple[int, int, int]:
-    s = math.floor((ncells / 2.0) ** (1 / 3))
-    return 2 * s, s, s
+# Finds three integer factors of n (their product is n) that are as close to each other as possible.
+def closest_three_factors(n):
+    best_triplet = None
+    min_range = float("inf")
+
+    # Try every divisor of n up to just past its cube root as the first factor
+    for factor_one in range(1, int(n ** (1 / 3)) + 2):  # the +2 gives slack for the truncated float cube root
+        if n % factor_one == 0:
+            n1 = n // factor_one  # What is left to split into the other two factors
+
+            # Try every divisor of n1 from factor_one up to just past sqrt(n1) as the second factor
+            for factor_two in range(factor_one, int(math.sqrt(n1)) + 2):  # the +2 gives slack for the truncated sqrt
+                if n1 % factor_two == 0:
+                    factor_three = n1 // factor_two  # Third factor is fixed by the first two
+
+                    triplet_range = factor_three - factor_one  # Spread measure: third factor minus first factor
+                    if triplet_range < min_range:
+                        min_range = triplet_range
+                        best_triplet = (factor_one, factor_two, factor_three)
+
+    return best_triplet
+
+
+def nxyz_from_ncells_weak(ncells: float) -> typing.Tuple[int, int, int, int, int, int]:
+    # Weak scaling sizes. With s = floor(cbrt(ncells)) and ND the three
+    # factors of nranks from closest_three_factors, returns
+    # (N1, N2, N3, L1, L2, L3) where Ni = ND[i] * s - 1 and Li = ND[i].
+    # Raises RuntimeError when nranks > 64 and any factor is below 4.
+    s = math.floor(ncells ** (1 / 3))
+    ND = closest_three_factors(nranks)
+    if any(N < 4 for N in ND) and nranks > 64:
+        raise RuntimeError(f"Cannot represent {nranks} ranks with at least 4 partitions in each direction.")
+    N1, N2, N3 = (N * s - 1 for N in ND)
+    L1, L2, L3 = ND
+    return N1, N2, N3, L1, L2, L3
+
+
+def nxyz_from_ncells_strong(ncells: float) -> typing.Tuple[int, int, int, int, int, int]:
+    # Strong scaling sizes: s = round(cbrt(ncells)) in every direction and
+    # fixed factors L1 = L2 = L3 = 4. Returns (s, s, s, L1, L2, L3).
+    s = round(ncells ** (1 / 3))
+    L1 = L2 = L3 = 4
+    return s, s, s, L1, L2, L3
 
 
 if args.scaling == "weak":
-    if args.fidelity == "ideal":
-        raise RuntimeError("ask ben")
-    else:
-        Nx, Ny, Nz = nxyz_from_ncells(cpg * nranks * args.memory)
+    Nx, Ny, Nz, Lx, Ly, Lz = nxyz_from_ncells_weak(cpg * args.memory)
 else:
-    Nx, Ny, Nz = nxyz_from_ncells(cpg * args.memory)
+    Nx, Ny, Nz, Lx, Ly, Lz = nxyz_from_ncells_strong(cpg * args.memory)
 
 # Atmospheric pressure - Pa (used as reference value)
 patm = 101325
@@ -71,152 +100,57 @@ def nxyz_from_ncells(ncells: float) -> typing.Tuple[int, int, int]:
 # Initial Droplet Diameter / Reference length - m
 D0 = 1.0e-3
 
-# cavity to droplet ratio
-CtD = 0.06
-
-# cavity relative eccentricity (distance between radii)
-ecc = 0.564
-
 # initial shock distance from the y axis. Note that the droplet center is located at y = 0. Thus, the distance from the shock to
 # the droplet is about D0/8
 ISD = 5.0 / 8 * D0
 
 ## pre-shock properties - AIR
-
-# pressure - Pa
-p0a = patm
-
-# density - kg/m3
-rho0a = 1.204
-
-# gamma
-gama = 1.40
-
-# pi infinity - Pa
-pia = 0
-
-# speed of sound - M/s
-c_a = math.sqrt(gama * (p0a + pia) / rho0a)
+p0a = patm  # pressure - Pa
+rho0a = 1.204  # density - kg/m3
+gama = 1.40  # gamma
+pia = 0  # pi infinity - Pa
+c_a = math.sqrt(gama * (p0a + pia) / rho0a)  # speed of sound - M/s
 
 ## Droplet - WATER
+rho0w = 1000  # density - kg/m3
+p0w = patm  # pressure - Pa
+gamw = 6.12  # gamma
+piw = 3.43e08  # pi infty - Pa
+c_w = math.sqrt(gamw * (p0w + piw) / rho0w)  # speed of sound - m/s
 
-# surface tension - N / m
-st = 0.00e0
-
-# Delta Pressure - Pa
-DP = -st * 4 / D0
-
-# initial pressure inside the droplet - Pa
-p0w = p0a - DP
-
-# density - kg/m3
-rho0w = 1000
-
-# gama
-gamw = 6.12
-
-# pi infty - Pa
-piw = 3.43e08
-
-# speed of sound - m/s
-c_w = math.sqrt(gamw * (p0w + piw) / rho0w)
-
-# Shock Mach number of interest. Note that the post-shock properties can be defined in terms of either
-# Min or psOp0a. Just comment/uncomment appropriatelly
+# Shock Mach number of interest
 Min = 2.4
 
-## Pos to pre shock ratios - AIR
-
-# pressure
-psOp0a = (Min**2 - 1) * 2 * gama / (gama + 1) + 1
-# psOp0a = 4.5
-
-# density
-rhosOrho0a = (1 + (gama + 1) / (gama - 1) * psOp0a) / ((gama + 1) / (gama - 1) + psOp0a)
-
-# Mach number of the shocked region - just a checker, as it must return "Min"
-Ms = math.sqrt((gama + 1.0) / (2.0 * gama) * (psOp0a - 1.0) * (p0a / (p0a + pia)) + 1.0)
-
-# shock speed of sound - m/s
-ss = Ms * c_a
-
-## post-shock - AIR
-
-# pressure - Pa
-ps = psOp0a * p0a
-
-# density - kg / m3
-rhos = rhosOrho0a * rho0a
-
-# post shock speed of sound - m/s
-c_s = math.sqrt(gama * (ps + pia) / rhos)
-
-# velocity at the post shock - m/s
-vel = c_a / gama * (psOp0a - 1.0) * p0a / (p0a + pia) / Ms
-
-## Domain boundaries - m
-
-# x direction
-xb = -8.4707 * D0
-xe = 9.6226 * D0
-
-# xb = -10 * D0
-# xe = 10 * D0
-
-# y direction
-yb = 0 * D0
-ye = 10 * D0
-
-# y direction
-zb = 0 * D0
-ze = 10 * D0
-
-# Stretching factor, to make sure the domaing is sufficiently large after the mesh stretch
-StF = 4.0
-
-# grid delta x if mesh were uniform in x direction - m. Note that I do not need a measure for dy
+# Pos to pre shock ratios - AIR
+psOp0a = (Min**2 - 1) * 2 * gama / (gama + 1) + 1  # pressure
+rhosOrho0a = (1 + (gama + 1) / (gama - 1) * psOp0a) / ((gama + 1) / (gama - 1) + psOp0a)  # density
+ss = Min * c_a  # shock speed of sound - m/s
+
+# post-shock conditions - AIR
+ps = psOp0a * p0a  # pressure - Pa
+rhos = rhosOrho0a * rho0a  # density - kg / m3
+c_s = math.sqrt(gama * (ps + pia) / rhos)  # post shock speed of sound - m/s
+vel = c_a / gama * (psOp0a - 1.0) * p0a / (p0a + pia) / Min  # velocity at the post shock - m/s
+
+# Domain extents
+xb = -Lx * D0 / 2
+xe = Lx * D0 / 2
+yb = -Ly * D0 / 2
+ye = Ly * D0 / 2
+zb = -Lz * D0 / 2
+ze = Lz * D0 / 2
+
+# Calculating time step
 dx = (xe - xb) / Nx
-
-# I calculate tend twice; first is an estimate, second is
-# the actual value used. This is because I am getting errors in the
-# post process part every time I approximate the actual Nt by an integer
-# number (think of a smarter way).
-
-# dimensionless time
-ttilde = 1.92
-
-# auxiliary simulation physical time - s. This is not YET the total simulation time, as it will be corrected so as to avoid
-# mismatches in simulation and post_process parts. Note that I wrote it this way so I have better control over the # of autosaves
-tendA = ttilde * D0 / vel
-
-cfl = 0.1
-
-# time-step - s
+cfl = 0.05
 dt = dx * cfl / ss
 
-# Save Frequency. Note that the number of autosaves will be SF + 1, as th IC (0.dat) is also saved
-SF = 400
-
-## making Nt divisible by SF
-# 1 - ensure NtA goes slightly beyond tendA
-NtA = int(tendA // dt + 1)
-
-# Array of saves. It is the same as Nt/Sf = t_step_save
-AS = int(NtA // SF + 1)
-
-# Nt = total number of steps. Note that Nt >= NtA (so at least tendA is completely simulated)
-Nt = args.n_steps or (AS * SF)
-SF = min(SF, Nt)
-
-# total simulation time - s. Note that tend >= tendA
-tend = Nt * dt
-
 # Configuring case args.mfcionary
 print(
     json.dumps(
         {
             # Logistics
-            "run_time_info": "T",
+            "run_time_info": "F",
             "rdma_mpi": args.rdma_mpi,
             # Computational Domain Parameters
             "x_domain%beg": xb,
@@ -231,8 +165,8 @@ def nxyz_from_ncells(ncells: float) -> typing.Tuple[int, int, int]:
             "cyl_coord": "F",
             "dt": dt,
             "t_step_start": 0,
-            "t_step_stop": Nt,
-            "t_step_save": SF,
+            "t_step_stop": args.n_steps,
+            "t_step_save": args.n_save,
             # Simulation Algorithm Parameters
             "num_patches": 3,
             "model_eqns": 2,
@@ -241,34 +175,35 @@ def nxyz_from_ncells(ncells: float) -> typing.Tuple[int, int, int]:
             "mpp_lim": "T",
             "mixture_err": "T",
             "time_stepper": 3,
-            "weno_order": 3,
+            "weno_order": 5,
             "weno_eps": 1.0e-16,
             "weno_Re_flux": "F",
             "weno_avg": "F",
-            "mapped_weno": "T",
+            "mapped_weno": "F",
             "riemann_solver": 2,
             "wave_speeds": 1,
             "avg_state": 2,
-            "bc_x%beg": -6,
-            "bc_x%end": -6,
-            "bc_y%beg": -2,
+            "bc_x%beg": -3,
+            "bc_x%end": -3,
+            "bc_y%beg": -3,
             "bc_y%end": -3,
-            "bc_z%beg": -2,
+            "bc_z%beg": -3,
             "bc_z%end": -3,
             # Formatted Database Files Structure Parameters
             "format": 1,
             "precision": 2,
             "prim_vars_wrt": "T",
             "parallel_io": "T",
+            "file_per_process": "T",
             # I will use 1 for WATER properties, and 2 for AIR properties
             # Patch 1: Background (AIR - 2)
             "patch_icpp(1)%geometry": 9,
-            "patch_icpp(1)%x_centroid": (xb + xe) / 2 * StF,
-            "patch_icpp(1)%y_centroid": (yb + ye) / 2 * StF,
-            "patch_icpp(1)%z_centroid": (yb + ye) / 2 * StF,
-            "patch_icpp(1)%length_x": (xe - xb) * StF,
-            "patch_icpp(1)%length_y": (ye - yb) * StF,
-            "patch_icpp(1)%length_z": (ze - zb) * StF,
+            "patch_icpp(1)%x_centroid": (xb + xe) / 2,
+            "patch_icpp(1)%y_centroid": (yb + ye) / 2,
+            "patch_icpp(1)%z_centroid": (zb + ze) / 2,
+            "patch_icpp(1)%length_x": (xe - xb),
+            "patch_icpp(1)%length_y": (ye - yb),
+            "patch_icpp(1)%length_z": (ze - zb),
             "patch_icpp(1)%vel(1)": 0.0e00,
             "patch_icpp(1)%vel(2)": 0.0e00,
             "patch_icpp(1)%vel(3)": 0.0e00,
@@ -280,12 +215,12 @@ def nxyz_from_ncells(ncells: float) -> typing.Tuple[int, int, int]:
             # Patch 2: Shocked state (AIR - 2)
             "patch_icpp(2)%geometry": 9,
             "patch_icpp(2)%alter_patch(1)": "T",
-            "patch_icpp(2)%x_centroid": -ISD - (xe - xb) / 2 * StF,
-            "patch_icpp(2)%y_centroid": (yb + ye) / 2 * StF,
-            "patch_icpp(2)%z_centroid": (zb + ze) / 2 * StF,
-            "patch_icpp(2)%length_x": (xe - xb) * StF,
-            "patch_icpp(2)%length_y": (ye - yb) * StF,
-            "patch_icpp(2)%length_z": (ze - zb) * StF,
+            "patch_icpp(2)%x_centroid": -ISD - (xe - xb) / 2,
+            "patch_icpp(2)%y_centroid": (yb + ye) / 2,
+            "patch_icpp(2)%z_centroid": (zb + ze) / 2,
+            "patch_icpp(2)%length_x": (xe - xb),
+            "patch_icpp(2)%length_y": (ye - yb),
+            "patch_icpp(2)%length_z": (ze - zb),
             "patch_icpp(2)%vel(1)": vel,
             "patch_icpp(2)%vel(2)": 0.0e00,
             "patch_icpp(2)%vel(3)": 0.0e00,
diff --git a/examples/scaling/reference.dat b/examples/scaling/reference.dat
new file mode 100644
index 0000000000..8f0c7d7118
--- /dev/null
+++ b/examples/scaling/reference.dat
@@ -0,0 +1,36 @@
+
+Weak Scaling - Memory: ~64GB, RDMA: F
+ nodes  time_avg  efficiency
+    16  1.040951    1.000000
+   128  1.047134    0.994095
+  1024  1.063446    0.978847
+  8192  1.068788    0.973955
+
+Weak Scaling - Memory: ~64GB, RDMA: T
+ nodes  time_avg  efficiency
+    16  0.959884    1.000000
+   128  0.962885    0.996884
+  1024  0.965518    0.994165
+  8192  0.988542    0.971010
+
+Strong Scaling - Memory: ~4096GB, RDMA: T
+ nodes  time_avg   speedup  efficiency
+     8  0.955644  1.000000    1.000000
+    64  0.149160  6.406820    0.800852
+   512  0.040367 23.674092    0.369908
+  4096  0.021175 45.130767    0.088146
+
+Strong Scaling - Memory: ~4096GB, RDMA: F
+ nodes  time_avg   speedup  efficiency
+     8  1.034303  1.000000    1.000000
+    64  0.171773  6.021347    0.752668
+   512  0.046694 22.150719    0.346105
+  4096  0.026555 35.987347    0.070288
+
+Grind Time - Single Device
+ memory  grind_time
+      8    1.309068
+     16    1.258899
+     32    1.144731
+     64    1.144664
+
diff --git a/examples/scaling/reference.metadata b/examples/scaling/reference.metadata
new file mode 100644
index 0000000000..04f21c422d
--- /dev/null
+++ b/examples/scaling/reference.metadata
@@ -0,0 +1,18 @@
+The reference data was collected on 9/12/25 using commit
+
+    465819231aa95c16d23f3d7cff1c93c9b17df43e
+
+from
+
+    https://github.com/wilfonba
+
+with the following modules loaded
+
+Currently Loaded Modules:
+  1) craype-x86-trento                5) Core/25.03          9) DefApps            13) cray-libsci/25.03.0  17) perftools-base/25.03.0  21) cray-hdf5/1.12.2.11        25) darshan-runtime/3.4.6-mpi (E4S)
+  2) libfabric/1.22.0                 6) tmux/3.4           10) PrgEnv-cray/8.6.0  14) cray-mpich/8.1.32    18) cpe/25.03               22) cray-python/3.11.7
+  3) craype-network-ofi               7) hsi/default        11) cce/19.0.0         15) cray-pmi/6.1.15      19) rocm/6.3.1              23) craype-accel-amd-gfx90a
+  4) xpmem/2.11.3-1.3_gdbda01a1eb3d   8) lfs-wrapper/0.0.1  12) cray-dsmml/0.3.1   16) craype/2.7.34        20) cray-fftw/3.3.10.9      24) rocprofiler-compute/3.0.0
+
+  Where:
+   E4S:  E4S: Extreme-scale Scientific Software Stack (E4S) https://e4s.io/index.html
diff --git a/examples/scaling/submit.sh b/examples/scaling/submit.sh
deleted file mode 100644
index 1d3ed373f9..0000000000
--- a/examples/scaling/submit.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-
-mkdir -p examples/scaling/logs
-
-for N in 1 2 4 8 16 32; do
-    echo -n "N=$N: "
-    sbatch <<EOT
-#!/bin/bash
-#SBATCH --job-name MFC-S-$N
-#SBATCH --account=CFD154
-#SBATCH --nodes=$N
-#SBATCH --ntasks-per-node=8
-#SBATCH --time=00:20:00
-#SBATCH --qos=debug
-#SBATCH --output=MFC-S-$N.out
-#SBATCH --cpus-per-task=7
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=closest
-#SBATCH --mail-user="hberre3@gatech.edu"
-#SBATCH --mail-type="BEGIN, END, FAIL"
-
-set -e
-set -x
-
-. ./mfc.sh load -c f -m g
-
-cd "\$SLURM_SUBMIT_DIR"
-
-echo "Hosts"
-
-srun hostname
-
-echo "Start @ $(date)"
-
-for M in 256 384 512; do
-
-    slug=$N-\$M
-    case_dir=examples/scaling/run-\$slug
-    mkdir -p \$case_dir
-    cp examples/scaling/case.py \$case_dir/
-
-    if [ ! -d \$case_dir/restart_data ]; then
-        echo "Running pre_process"
-
-        # Note: `time` is not used for performance measurement, only for monitoring
-        #       the job's progress.
-        time ./mfc.sh run \$case_dir/case.py -c frontier -n 8 -N $N --clean     \
-                -t pre_process --no-build -# run-\$slug-pre -- --scaling strong \
-                --memory \$M > examples/scaling/logs/run-\$slug-pre.out 2>&1
-    fi
-
-    for cu_mpi in F T; do
-
-        slug="$N-\$M-\$cu_mpi"
-        echo "Running \$slug"
-
-        # Note: `time` is not used for performance measurement, only for monitoring
-        #       the job's progress.
-        time ./mfc.sh run \$case_dir/case.py -c frontier -n 8 -N $N         \
-            -t simulation --case-optimization --no-build -# run-\$slug-sim  \
-            -- --scaling strong --memory \$M --cu_mpi \$cu_mpi --n-steps 20 \
-            > examples/scaling/logs/run-\$slug-sim.out 2>&1
-
-    done
-
-    rm -rf \$case_dir
-
-done
-
-echo "End @ $(date)"
-
-EOT
-done
\ No newline at end of file
diff --git a/examples/scaling/submit_all.sh b/examples/scaling/submit_all.sh
new file mode 100755
index 0000000000..9d50515ea8
--- /dev/null
+++ b/examples/scaling/submit_all.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Submit the weak scaling, strong scaling, and grind time benchmark jobs.
+ACCOUNT=""  # mandatory; set with --account <account_name>
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --account)
+            ACCOUNT="$2"
+            shift
+            shift
+            ;;
+        *)  # consume unrecognized arguments so the loop always terminates
+            echo "Unknown option $1"
+            shift
+            ;;
+    esac
+done
+
+# Check mandatory argument
+if [[ -z "$ACCOUNT" ]]; then
+    echo "Error: --account is required"
+    exit 1
+fi
+
+./examples/scaling/submit_weak.sh --account "$ACCOUNT"
+./examples/scaling/submit_strong.sh --account "$ACCOUNT"
+./examples/scaling/submit_grind.sh --account "$ACCOUNT"
diff --git a/examples/scaling/submit_grind.sh b/examples/scaling/submit_grind.sh
new file mode 100755
index 0000000000..9eedd9b595
--- /dev/null
+++ b/examples/scaling/submit_grind.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# Submit one single-device grind time job per memory size in MEM.
+# Initialize default values for optional arguments
+MEM=(8 16 32 64) # Approximate memory per device in GB
+
+# Mandatory argument
+ACCOUNT=""
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+    key="$1"
+    case $key in
+        --account)
+            ACCOUNT="$2"
+            shift
+            shift
+            ;;
+        --mem)
+            IFS=',' read -r -a MEM <<< "$2"
+            shift
+            shift
+            ;;
+        *)
+            echo "Unknown option $1"
+            shift
+            ;;
+    esac
+done
+
+# Check mandatory argument
+if [[ -z "$ACCOUNT" ]]; then
+    echo "Error: --account is required"
+    exit 1
+fi
+
+# Print results
+echo "ACCOUNT = $ACCOUNT"
+echo "MEM = ${MEM[*]}"
+
+mkdir -p examples/scaling/logs
+
+for M in "${MEM[@]}"; do
+    echo "M=$M"
+    # The heredoc is unquoted: \$M and \$ACCOUNT expand now, at submission;
+    # backslash-escaped expansions are left for the batch job to evaluate.
+    sbatch <<EOT
+#!/bin/bash
+#SBATCH --job-name MFC-G-$M
+#SBATCH --account=$ACCOUNT
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --time=00:10:00
+#SBATCH --output=MFC-G-$M.out
+#SBATCH --cpus-per-task=7
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=closest
+
+set -e
+set -x
+
+. ./mfc.sh load -c f -m g
+
+cd "\$SLURM_SUBMIT_DIR"
+
+echo "Hosts"
+
+srun hostname
+
+echo "Start @ \$(date)"
+
+slug=1-$M
+case_dir=examples/scaling/grind-\$slug
+mkdir -p \$case_dir
+cp examples/scaling/case.py \$case_dir/
+
+if [ ! -d \$case_dir/restart_data ]; then
+    echo "Running pre_process"
+
+    # Note: `time` is not used for performance measurement, only for monitoring
+    #       the job's progress.
+    time ./mfc.sh run \$case_dir/case.py -c frontier -n 1 -N 1 --clean \
+            -t pre_process -# grind-\$slug-pre -- --scaling weak \
+            --memory $M \
+            > examples/scaling/logs/grind-\$slug-pre.out 2>&1
+fi
+
+slug="1-$M-F"
+echo "Running \$slug"
+
+# Note: `time` is not used for performance measurement, only for monitoring
+#       the job's progress.
+time ./mfc.sh run \$case_dir/case.py -c frontier -n 1 -N 1 -t simulation \
+    --case-optimization -# grind-\$slug-sim -- --scaling weak \
+    --memory $M --n-steps 100 --n-save 20 \
+    > examples/scaling/logs/grind-\$slug-sim.out 2>&1
+
+rm -rf \$case_dir
+
+echo "End @ \$(date)"
+
+EOT
+done
diff --git a/examples/scaling/submit_strong.sh b/examples/scaling/submit_strong.sh
new file mode 100755
index 0000000000..bb23802346
--- /dev/null
+++ b/examples/scaling/submit_strong.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Optional arguments and their defaults; --nodes and --mem replace these lists.
+NODES=(8 64 512 4096)
+MEM=(4096) # Approximate total problem size in GB
+
+# Mandatory argument, filled in by --account
+ACCOUNT=""
+
+# Parse arguments. Each pass pops the option name first, so inside the case
+# arms the option's value, if any, is "$1" and needs only one more shift.
+until [[ $# -eq 0 ]]; do
+    opt="$1"
+    shift
+    case "$opt" in
+        --account)
+            ACCOUNT="$1"
+            shift
+            ;;
+        --nodes)
+            # Value is a single comma-separated word, e.g. --nodes 8,64
+            IFS=',' read -r -a NODES <<< "$1"
+            shift
+            ;;
+        --mem)
+            # Same comma-separated format as --nodes
+            IFS=',' read -r -a MEM <<< "$1"
+            shift
+            ;;
+        *)
+            # Not fatal: report it and carry on with the next word
+            echo "Unknown option $opt"
+            ;;
+    esac
+done
+
+# --account is mandatory: it is forwarded to sbatch as #SBATCH --account.
+if [[ -z "$ACCOUNT" ]]; then
+    echo "Error: --account is required" >&2
+    exit 1
+fi
+
+# Echo the resolved configuration so the submission output is self-describing.
+echo "ACCOUNT = $ACCOUNT"
+echo "NODES = ${NODES[*]}"
+echo "MEM = ${MEM[*]}"
+# The generated job scripts redirect their logs into this directory.
+mkdir -p examples/scaling/logs || exit 1
+
+# Submit one job per (node count, problem size) pair. The here-document is
+# unquoted: $N, $M and $ACCOUNT are expanded now, at submission. Everything
+# escaped as \$ (including the date calls, so the timestamps are taken while
+# the job runs) is left for the job script to evaluate.
+for N in "${NODES[@]}"; do
+    for M in "${MEM[@]}"; do
+        echo -n "N=$N: M=$M"
+        sbatch <<EOT
+#!/bin/bash
+#SBATCH --job-name MFC-S-$N-$M
+#SBATCH --account=$ACCOUNT
+#SBATCH --nodes=$N
+#SBATCH --ntasks-per-node=8
+#SBATCH --time=00:20:00
+#SBATCH --output=MFC-S-$N-$M.out
+#SBATCH --cpus-per-task=7
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=closest
+
+set -e
+set -x
+
+. ./mfc.sh load -c f -m g
+
+cd "\$SLURM_SUBMIT_DIR"
+
+echo "Hosts"
+srun hostname
+
+echo "Start @ \$(date)"
+
+slug=$N-$M
+case_dir=examples/scaling/strong-\$slug
+mkdir -p \$case_dir
+cp examples/scaling/case.py \$case_dir/
+
+if [ ! -d \$case_dir/restart_data ]; then
+    echo "Running pre_process"
+
+    # Note: the time prefix is not used for performance measurement, only for
+    #       monitoring the job's progress.
+    time ./mfc.sh run \$case_dir/case.py -c frontier -n 8 -N $N --clean \
+            -t pre_process -# strong-\$slug-pre -- --scaling strong \
+            --memory $M \
+            > examples/scaling/logs/strong-\$slug-pre.out 2>&1
+fi
+
+# Run the simulation once with --rdma_mpi F and once with --rdma_mpi T.
+for rdma_mpi in F T; do
+    slug="$N-$M-\$rdma_mpi"
+    echo "Running \$slug"
+
+    # Note: the time prefix is not used for performance measurement, only for
+    #       monitoring the job's progress.
+    time ./mfc.sh run \$case_dir/case.py -c frontier -n 8 -N $N -t simulation \
+        --case-optimization -# strong-\$slug-sim -- --scaling strong \
+        --memory $M --rdma_mpi \$rdma_mpi --n-steps 20 \
+        > examples/scaling/logs/strong-\$slug-sim.out 2>&1
+
+done
+
+rm -rf \$case_dir
+
+echo "End @ \$(date)"
+EOT
+done
+done
diff --git a/examples/scaling/submit_weak.sh b/examples/scaling/submit_weak.sh
new file mode 100755
index 0000000000..ffe4a89160
--- /dev/null
+++ b/examples/scaling/submit_weak.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Optional arguments and their defaults; --nodes and --mem replace these lists.
+NODES=(16 128 1024 8192)
+MEM=(64) # Approximate problem size per GCD in GB
+
+# Mandatory argument, filled in by --account
+ACCOUNT=""
+
+# Parse arguments. Each pass pops the option name first, so inside the case
+# arms the option's value, if any, is "$1" and needs only one more shift.
+until [[ $# -eq 0 ]]; do
+    opt="$1"
+    shift
+    case "$opt" in
+        --account)
+            ACCOUNT="$1"
+            shift
+            ;;
+        --nodes)
+            # Value is a single comma-separated word, e.g. --nodes 16,128
+            IFS=',' read -r -a NODES <<< "$1"
+            shift
+            ;;
+        --mem)
+            # Same comma-separated format as --nodes
+            IFS=',' read -r -a MEM <<< "$1"
+            shift
+            ;;
+        *)
+            # Not fatal: report it and carry on with the next word
+            echo "Unknown option $opt"
+            ;;
+    esac
+done
+
+# --account is mandatory: it is forwarded to sbatch as #SBATCH --account.
+if [[ -z "$ACCOUNT" ]]; then
+    echo "Error: --account is required" >&2
+    exit 1
+fi
+
+# Echo the resolved configuration so the submission output is self-describing.
+echo "ACCOUNT = $ACCOUNT"
+echo "NODES = ${NODES[*]}"
+echo "MEM = ${MEM[*]}"
+# The generated job scripts redirect their logs into this directory.
+mkdir -p examples/scaling/logs || exit 1
+
+# Submit one job per (node count, problem size) pair. The here-document is
+# unquoted: $N, $M and $ACCOUNT are expanded now, at submission. Everything
+# escaped as \$ (including the date calls, so the timestamps are taken while
+# the job runs) is left for the job script to evaluate.
+for N in "${NODES[@]}"; do
+    for M in "${MEM[@]}"; do
+        echo -n "N=$N: M=$M"
+        sbatch <<EOT
+#!/bin/bash
+#SBATCH --job-name MFC-W-$N-$M
+#SBATCH --account=$ACCOUNT
+#SBATCH --nodes=$N
+#SBATCH --ntasks-per-node=8
+#SBATCH --time=00:20:00
+#SBATCH --output=MFC-W-$N-$M.out
+#SBATCH --cpus-per-task=7
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=closest
+
+set -e
+set -x
+
+. ./mfc.sh load -c f -m g
+
+cd "\$SLURM_SUBMIT_DIR"
+
+echo "Hosts"
+srun hostname
+
+echo "Start @ \$(date)"
+
+slug=$N-$M
+case_dir=examples/scaling/weak-\$slug
+mkdir -p \$case_dir
+cp examples/scaling/case.py \$case_dir/
+
+if [ ! -d \$case_dir/restart_data ]; then
+    echo "Running pre_process"
+
+    # Note: the time prefix is not used for performance measurement, only for
+    #       monitoring the job's progress.
+    time ./mfc.sh run \$case_dir/case.py -c frontier -n 8 -N $N --clean \
+            -t pre_process -# weak-\$slug-pre -- --scaling weak \
+            --memory $M \
+            > examples/scaling/logs/weak-\$slug-pre.out 2>&1
+fi
+
+# Run the simulation once with --rdma_mpi F and once with --rdma_mpi T.
+for rdma_mpi in F T; do
+    slug="$N-$M-\$rdma_mpi"
+    echo "Running \$slug"
+
+    # Note: the time prefix is not used for performance measurement, only for
+    #       monitoring the job's progress.
+    time ./mfc.sh run \$case_dir/case.py -c frontier -n 8 -N $N -t simulation \
+        --case-optimization -# weak-\$slug-sim -- --scaling weak \
+        --memory $M --rdma_mpi \$rdma_mpi --n-steps 20 \
+        > examples/scaling/logs/weak-\$slug-sim.out 2>&1
+
+done
+
+rm -rf \$case_dir
+
+echo "End @ \$(date)"
+EOT
+done
+done