Continue to modularize trainer

ashao · ashao · commit 92953a73e0fa · 2025-10-08T00:10:29.000Z
Break the definition and particulars of model training out of the
training script into their own files. To include a new model,
users should add another file to the networks directory and create
two new classes: the model class, which defines the architecture,
and the trainer class, which controls how the model is trained.
diff --git a/run/meshMotion/.gitignore b/run/meshMotion/.gitignore
@@ -2,3 +2,4 @@
 *.stl 
 *.csv
 *.pdf
+ellipsoid3D_MachineLearningMeshMotion/*
diff --git a/run/meshMotion/ml_model_training.py b/run/meshMotion/ml_model_training.py
@@ -1,65 +1,41 @@
 import argparse
-from smartredis import Client
 import torch
-import torch.nn as nn
 import numpy as np
 import io
-from sklearn.model_selection import train_test_split
 import torch.optim as optim
-import time
-from typing import Tuple, Union
-from matplotlib import pyplot as plt
-
-from sklearn.metrics import mean_squared_error
-
-class MLP(nn.Module):
-    def __init__(self, num_layers, layer_width, input_size, output_size, activation_fn):
-        super(MLP, self).__init__()
-
-        layers = []
-        layers.append(nn.Linear(input_size, layer_width))
-        layers.append(activation_fn)
-
-        for _ in range(num_layers - 2):
-            layers.append(nn.Linear(layer_width, layer_width))
-            layers.append(activation_fn)
 
-        layers.append(nn.Linear(layer_width, output_size))
-        self.layers = nn.Sequential(*layers)
-
-    def forward(self, x):
-        return self.layers(x)
-
-def loss_weighted_center(y_true, y_pred, weights, weights_power):
-    weights_normed = torch.pow(weights, weights_power)
-    weights_normed = weights_normed/torch.sum(weights_normed)
+from matplotlib import pyplot as plt
+from smartredis import Client
 
-    return torch.sum(torch.sum((y_true-y_pred)**2, dim=1)*weights_normed)
+from MLP import MLP, MLPTrainer
 
 
 def train(args):
     client = Client()
     torch.set_default_dtype(torch.float64)
 
     # Read the solution direction from a database
-    dimension = int(client.get_tensor("solution_dim"))
+    dimension = int(client.get_tensor("solution_dim")[0])
 
     print (f"Solution dimension = {dimension}.")
-
     # Initialize the model
-    model = MLP(
-        num_layers=3,
-        layer_width=10,
-        input_size=dimension,
-        output_size=dimension,
-        activation_fn=torch.nn.ELU()
-    )
-
-    # Initialize the optimizer
-    learning_rate = 1e-3
-    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+    if args.model_name == "mlp":
+        model = MLP(
+            input_size=dimension,
+            output_size=dimension,
+            num_layers=3,
+            layer_width=10,
+            activation_fn=torch.nn.ELU()
+        )
+        trainer = MLPTrainer(model, args.radius_power)
 
+    data_ready = client.poll_key("points", 1, 10000)
+    points = client.get_tensor("points")
+    interior_points = np.vstack([client.get_tensor(f"points_MPI_{i}") for i in range(args.mpi_ranks)])
+    X = torch.from_numpy(points).to(torch.float64)
     # Make sure all datasets are avaialble in the smartredis database.
+
+    epochs = 5000
     iteration = 1
     while True:
 
@@ -69,47 +45,22 @@ def train(args):
         if (not data_ready):
             raise RuntimeError("Data not found in SmartRedis; aborting training.")
 
-        points = client.get_tensor("points")
         displacements = client.get_tensor("displacements")
-
+        # NOTE(review): interior_points is loaded once before the loop; do not rebind it here.
         client.delete_tensor("data_ready")
 
-        X = torch.from_numpy(points).to(torch.float64)
         y = torch.from_numpy(displacements).to(torch.float64)
 
-        # Find the center of the shape as the average of all the points on the inner boundary
-        r = torch.sqrt(torch.sum(X**2, dim=1))
-        inner = r < 5
-        center = torch.mean(X[inner], dim=0)
-
-        dist = torch.sqrt(torch.sum((X-center)**2, dim=1))
-        wts = dist/torch.sum(dist)
 
         validation_rmse = []
-        model.train()
-        epochs = 5000
         n_epochs = 0
 
         for epoch in range(epochs):
-            # Zero the gradients
-            optimizer.zero_grad()
-
-            # Forward pass on the training data
-            displ_pred = model(X)
-
-            # Compute loss on the training data
-            loss_train = loss_weighted_center(displ_pred, y, wts, args.radius_power)
-
-            if (loss_train < 5e-05):
+            loss, model = trainer.training_step(X, y)
+            if trainer.converged():
                 break
 
-            # Backward pass and optimization
-            loss_train.backward()
-            optimizer.step()
-
-            n_epochs = n_epochs + 1
-
-        print (f"MSE {loss_train.item()}, number of epochs {n_epochs}", flush=True)
+        print(f"MSE {loss.item()}, number of epochs {epoch + 1}", flush=True)
         np.savez(
             f"data_{iteration:02d}.npz",
             points=points,
@@ -150,6 +101,11 @@ def train(args):
     parser = argparse.ArgumentParser(description="Training script for mesh motion")
     parser.add_argument("mpi_ranks", help="number of mpi ranks", type=int)
     parser.add_argument("radius_power", help="power law to weight losses", type=float)
+    parser.add_argument("model_name",
+                        help="which model to use to calculate interior displacements",
+                        choices=["mlp"],
+                        type=str
+    )
     args = parser.parse_args()
 
     train(args)
diff --git a/run/meshMotion/networks/MLP.py b/run/meshMotion/networks/MLP.py
@@ -0,0 +1,88 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+
+class MLP(nn.Module):
+    """Fully connected network built from ``nn.Linear`` layers.
+
+    The stack is ``input_size -> layer_width``, then ``num_layers - 2``
+    hidden ``layer_width -> layer_width`` layers, each followed by
+    ``activation_fn``, and a final ``layer_width -> output_size`` layer
+    with no activation.
+    """
+
+    def __init__(self, input_size, output_size, activation_fn, num_layers, layer_width):
+        super().__init__()
+
+        layers = [nn.Linear(input_size, layer_width), activation_fn]
+        for _ in range(num_layers - 2):
+            layers.append(nn.Linear(layer_width, layer_width))
+            layers.append(activation_fn)
+        layers.append(nn.Linear(layer_width, output_size))
+
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Return the output of the layer stack for input ``x``."""
+        return self.layers(x)
+
+
+class MLPTrainer:
+    """Train a model with Adam on a distance-weighted displacement loss.
+
+    Args:
+        model: network whose parameters are optimised.
+        radius_power: exponent applied to each point's distance from the
+            centre of the displaced points when building loss weights.
+        lr: learning rate passed to ``optim.Adam``.
+        loss_stop: ``converged`` returns True once the latest loss is
+            below this value.
+    """
+
+    def __init__(self, model, radius_power, lr=1e-3, loss_stop=5e-5):
+        self.model = model
+        self.optimizer = optim.Adam(model.parameters(), lr=lr)
+        self.loss_stop = loss_stop
+        self.loss_value = None  # set by training_step
+        self.radius_power = radius_power
+
+    def loss(self, X, y_true):
+        """Return the weighted absolute error of ``model(X)`` against ``y_true``.
+
+        Each point's error is weighted by its distance from the mean
+        position of the points with a non-zero displacement, raised to
+        ``radius_power`` and normalised so the weights sum to one.
+        ``X`` and ``y_true`` are indexed as (point, component).
+        """
+        # One flag per point. Indexing X with the element-wise 2-D mask
+        # would flatten the selection and reduce the centre to a scalar.
+        moving = torch.any(y_true != 0., dim=1)
+        # The mean of an empty selection is NaN; fall back to all points.
+        center = torch.mean(X[moving] if torch.any(moving) else X, dim=0)
+        scaled_dist = torch.sqrt(torch.sum((X - center)**2, dim=1))**self.radius_power
+        wts = scaled_dist/torch.sum(scaled_dist)
+
+        y_pred = self.model(X)
+        # abs(d) equals sqrt(d**2) but keeps a finite gradient at d == 0.
+        return torch.sum(wts*torch.sum(torch.abs(y_true - y_pred), dim=1))
+
+    def training_step(self, X, y_true):
+        """Run one optimiser update and return ``(loss, model)``.
+
+        The returned loss is evaluated before the parameters are updated.
+        """
+        self.optimizer.zero_grad()
+        loss_value = self.loss(X, y_true)
+        self.loss_value = loss_value
+        loss_value.backward()
+        self.optimizer.step()
+
+        return loss_value, self.model
+
+    def converged(self):
+        """Return True when the latest loss is below ``loss_stop``."""
+        # Before the first training step there is no loss to compare.
+        if self.loss_value is None:
+            return False
+        return self.loss_value.item() < self.loss_stop
diff --git a/run/meshMotion/requirements.txt b/run/meshMotion/requirements.txt
@@ -0,0 +1,2 @@
+gmsh
+PyFoam
diff --git a/run/meshMotion/smartsim_driver.py b/run/meshMotion/smartsim_driver.py
@@ -57,10 +57,10 @@ def main(args):
     openfoam_rs = exp.create_run_settings(
         exe="moveDynamicMesh",
         exe_args="-parallel",
+        run_command="mpirun"
     )
     openfoam_rs.set_tasks(num_mpi_ranks)
     openfoam_rs.set_nodes(1)
-    openfoam_rs.set("exclusive")
 
     # Create the model from the OpenFOAM case argument
     openfoam_model = exp.create_model(
@@ -75,7 +75,7 @@ def main(args):
 
     training_rs = exp.create_run_settings(
         exe="python",
-        exe_args=f"ml_model_training.py {num_mpi_ranks} {args.radius_power}"
+        exe_args=f"ml_model_training.py {num_mpi_ranks} {args.radius_power} mlp"
     )
     training_rs.set_tasks(1)
     training_rs.set_nodes(1)
@@ -84,7 +84,9 @@ def main(args):
         name="ml_model_training",
         run_settings=training_rs
     )
-    ml_model_training.attach_generator_files(to_copy="ml_model_training.py")
+    ml_model_training.attach_generator_files(
+        to_copy=["ml_model_training.py", "networks/MLP.py"]
+    )
 
     exp.generate(ml_model_training, overwrite=True)
 

-Original file line number
+Diff line change
 *.stl
 *.csv
 *.pdf
 +ellipsoid3D_MachineLearningMeshMotion/*