Commit 55a6c79
update lbfgs optimizer and annealing strategy
1 parent 1272df3

File tree

3 files changed: +710 −59 lines changed

run/meshMotion/wingMotion/mesh_trainer_pinn.py

Lines changed: 57 additions & 59 deletions
@@ -12,6 +12,19 @@
 import matplotlib.pyplot as plt
 
 from sklearn.metrics import mean_squared_error
+
+
+def annealing_weight(epoch, T_start, T_end, sharpness=3):
+    """Sigmoid-shaped ramp for the physics-loss weight.
+
+    Returns 0.0 before T_start and 1.0 after T_end; in between, a scaled
+    logistic curve (the factor of 100 keeps the weight small for most of
+    the window).
+    """
+    if epoch < T_start:
+        return 0.0
+    elif epoch > T_end:
+        return 1.0
+    else:
+        # map epoch into the range [0, 1]
+        x = (epoch - T_start) / (T_end - T_start)
+        return float(1 / (1 + np.exp(-sharpness * (x - 0.5)) * 100))
+
 class EarlyStopping:
     """Early stopping with absolute threshold and patience-based logic."""
 
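Note: with the values set later in train() (T_start = 0, T_end = 0.5 * epochs = 1000, sharpness = 10), the ×100 factor inside the logistic keeps the weight below roughly 0.6 for the whole ramp, so the schedule jumps from ≈0.6 to 1.0 once epoch passes T_end. A quick self-contained check (the sampled epochs are arbitrary):

import numpy as np

def annealing_weight(epoch, T_start, T_end, sharpness=3):
    # copy of the function added above
    if epoch < T_start:
        return 0.0
    elif epoch > T_end:
        return 1.0
    else:
        x = (epoch - T_start) / (T_end - T_start)
        return float(1 / (1 + np.exp(-sharpness * (x - 0.5)) * 100))

for epoch in [0, 500, 900, 1000, 1001]:
    print(epoch, round(annealing_weight(epoch, 0, 1000, sharpness=10), 4))
# -> 0 0.0001, 500 0.0099, 900 0.3532, 1000 0.5974, 1001 1.0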

@@ -20,6 +33,7 @@ def __init__(
         patience: int = 40,
         min_delta: float = 1.0e-4,
         model: Union[nn.Module, None] = None,
+        max_epochs: int = 10000
     ):
         self._patience = patience
         self._min_delta = min_delta
@@ -29,28 +43,39 @@ def __init__(
         self._stop = False
         self._model_buffer = None
         self._model_script = None
+        self._epoch = 0
+        self._best_loss_epoch = 0
+        self._max_epochs = max_epochs
+        self._T_start = 0
 
-    def __call__(self, loss: float) -> bool:
+    def __call__(self, loss: float, epoch) -> bool:
         """Check if training should stop."""
-        if loss < self._best_loss * (1.0 - self._min_delta):
-            self._best_loss = loss
-            self._counter = 0
-            if self._model is not None:
-                self.save_model()
-
-        else:
-            self._counter += 1
-            if self._counter >= self._patience:
-                self._stop = True
+        self._epoch = epoch
+        if self._epoch >= self._max_epochs:
+            self._stop = True
+            print(f"epoch: {self._epoch} reached max epochs.")
+        if self._epoch >= self._T_start:
+            if loss < self._best_loss * (1.0 - self._min_delta):
+                self._best_loss = loss
+                self._counter = 0
+                self._best_loss_epoch = self._epoch
+                if self._model is not None:
+                    self._save_model()
+            else:
+                self._counter += 1
+                if self._counter > self._patience:
+                    self._stop = True
+
         return self._stop
     def reset(self):
         """Reset the early stopping state."""
         self._model.train()
         self._best_loss = float("inf")
         self._counter = 0
         self._stop = False
+        self._epoch = 0
 
-    def save_model(self):
+    def _save_model(self):
         self._model.eval()
         with io.BytesIO() as buffer:
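Note: the improvement test loss < best * (1.0 - min_delta) is relative, so with min_delta = 1e-3 (the value used in train()) a new validation RMSE only resets the patience counter when it beats the best loss by more than 0.1%. A throwaway check with invented numbers:

best, min_delta = 0.5, 1e-3
print(0.4996 < best * (1.0 - min_delta))  # False: only 0.08% better, counts toward patience
print(0.4990 < best * (1.0 - min_delta))  # True: 0.2% better, resets the counter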

@@ -139,7 +164,7 @@ def train(num_mpi_ranks):
     torch.set_default_dtype(torch.float64)
 
     # Initialize the model
-    model = MLP(num_layers=3, layer_width=50, input_size=2, output_size=2, activation_fn=torch.nn.ReLU()).to(device)
+    model = MLP(num_layers=3, layer_width=50, input_size=2, output_size=2, activation_fn=torch.nn.Tanh()).to(device)
 
     # Initialize the optimizer
     learning_rate = 1e-04
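
Note: the switch from ReLU to Tanh matters for a PINN because the physics residual differentiates the network output twice, and ReLU's second derivative is zero almost everywhere, which silences the PDE term. A minimal autograd check, illustrative only (not repo code):

import torch

x = torch.linspace(-1.0, 1.0, 5, dtype=torch.float64, requires_grad=True)

for act in (torch.relu, torch.tanh):
    y = act(x)
    (g,) = torch.autograd.grad(y.sum(), x, create_graph=True)  # dy/dx
    (h,) = torch.autograd.grad(g.sum(), x, allow_unused=True)  # d2y/dx2
    print(act.__name__, h if h is not None else "identically zero (graph disconnected)")
# relu -> zero second derivative everywhere; tanh -> non-zero curvature
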
@@ -148,15 +173,20 @@ def train(num_mpi_ranks):
     # # L-BFGS optimizer (alternative; currently commented out)
     # optimizer = optim.LBFGS(model.parameters(), lr=1.0, max_iter=20, tolerance_grad=1e-7, tolerance_change=1e-9, history_size=100)
 
+    epochs = 2000
+    # Annealing schedule parameters
+    T_start = 0
+    T_end = 0.5 * epochs
+
     early_stopper = EarlyStopping(
-        patience=50,
+        patience=100,
         min_delta=1e-3,
-        model=model
+        model=model,
+        max_epochs=epochs
     )
     # Make sure all datasets are available in the smartredis database.
     local_time_index = 1
     while True:
-
         print(f"Time step {local_time_index}")
         # Fetch datasets from SmartRedis
@@ -228,10 +258,9 @@ def train(num_mpi_ranks):
         loss_func = nn.MSELoss()
 
         model.train()
-        epochs = 5000
         n_epochs = 0
         rmse_loss_val = 1
-
+
         for epoch in range(epochs):
             # Zero the gradients
             optimizer.zero_grad()
@@ -245,63 +274,32 @@ def train(num_mpi_ranks):
 
             # Annealed weight: physics weight ramps up on a sigmoid
             # schedule between T_start and T_end (see annealing_weight above)
-            physics_weight = max(0.0001, 0.001 * epoch / epochs + 0.0001)
+            physics_weight = annealing_weight(epoch, T_start, T_end, sharpness=10)
             data_weight = 1.0
 
             loss_train = data_weight * data_loss + physics_weight * p_loss
-            print(
-                f"[Epoch {epoch}/{epochs}] "
-                f"data loss: {data_loss.item():.6f}, "
-                f"physics loss: {p_loss.item():.6f}, "
-                f"physics_weight: {physics_weight:.4f}"
-            )
+            if epoch % 50 == 0 or epoch == epochs - 1:
+                print(
+                    f"[Epoch {epoch}/{epochs}] "
+                    f"data loss: {data_loss.item()}, "
+                    f"physics loss: {p_loss.item()}, "
+                    f"physics_weight: {physics_weight}"
+                )
             # Backward pass and optimization
             loss_train.backward()
             optimizer.step()
 
-            # for epoch in range(epochs):
-            #     # Define closure function for L-BFGS
-            #     def closure():
-            #         optimizer.zero_grad()
-
-            #         # Forward pass on the training data
-            #         displ_pred = model(points_train)
-
-            #         # Compute loss on the training data with annealed weight
-            #         data_loss = loss_func(displ_pred, displ_train)
-            #         p_loss = pinn_loss(points_train, displ_pred)
-
-            #         # Annealed weight: start with high physics weight, gradually decrease
-            #         # Physics weight decreases from 1.0 to 0.01 over training
-            #         physics_weight = max(0.01, 1.0 * (1.0 - epoch / epochs))
-            #         data_weight = 1.0
-
-            #         loss_train = data_weight * data_loss + physics_weight * p_loss
-            #         loss_train.backward()
-            #         return loss_train
-
-            #     # L-BFGS optimization step
-            #     optimizer.step(closure)
-
             n_epochs = n_epochs + 1
             # Forward pass on the validation data, with torch.no_grad() for efficiency
             with torch.no_grad():
                 displ_pred_val = model(points_val)
                 mse_loss_val = loss_func(displ_pred_val, displ_val)
                 rmse_loss_val = torch.sqrt(mse_loss_val)
-            if early_stopper(rmse_loss_val.item()):
+            if early_stopper(rmse_loss_val.item(), epoch):
                 print(f"Training stopped at epoch {epoch}")
-                print(f"RMSE {early_stopper._best_loss}, number of epochs {n_epochs}")
+                print(f"RMSE {early_stopper._best_loss}, epoch of smallest loss: {early_stopper._best_loss_epoch}")
                 early_stopper.reset()
                 break
-
-            # if epoch % 1000 == 0 or epoch == epochs - 1:
-            #     print(f"[Epoch {epoch}]")
-            #     print(f"  Data Loss      : {data_loss.item():.6e}")
-            #     print(f"  PINN Loss      : {p_loss.item():.6e}")
-            #     print(f"  Physics Weight : {physics_weight:.4f}")
-            #     print(f"  Data Weight    : {data_weight:.4f}")
-            #     print(f"  Validation RMSE: {rmse_loss_val:.6e}")
 
         # Store the model into SmartRedis
         client.set_model("MLP", early_stopper._model_buffer, "TORCH", "CPU")
