
Commit 32ba967

Update training and inference configurations
1 parent 768bda2 commit 32ba967

8 files changed: 158 additions, 43 deletions

floracast/configs/training.yaml

Lines changed: 12 additions & 10 deletions
@@ -41,20 +41,22 @@ steps:

   train_model:
     parameters:
-      input_chunk_length: 90
-      output_chunk_length: 14
-      hidden_size: 256
-      lstm_layers: 2
-      num_attention_heads: 8
-      dropout: 0.15
-      batch_size: 16
-      n_epochs: 100
+      input_chunk_length: 14 # Longer input for better pattern recognition
+      output_chunk_length: 14 # 2-week forecasting horizon for impressive demo
+      hidden_size: 16 # Smaller hidden size to prevent instability
+      lstm_layers: 1 # Single layer
+      num_attention_heads: 1 # Single head to prevent complexity issues
+      dropout: 0.0 # No dropout to eliminate regularization issues
+      batch_size: 4 # Small batch size that works with data
+      n_epochs: 5 # Few epochs
       random_state: 42
-      add_relative_index: true
+      add_relative_index: true # Required for TFT - generates future covariates
       enable_progress_bar: true
       enable_model_summary: true
+      learning_rate: 0.001 # Standard learning rate that works
+      weight_decay: 0.0 # No weight decay to eliminate regularization issues

   evaluate:
     parameters:
-      horizon: 14
+      horizon: 14 # Match updated output_chunk_length - 2 weeks
       metric: "smape"
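
Note on consuming this config: each key under steps.<step_name>.parameters must match a keyword argument of the corresponding step function (train_model, evaluate). Below is a minimal sketch of wiring the file into a run; the pipeline name and import path are assumptions, not part of this commit.

# Sketch only: entrypoint module and pipeline name are assumed.
from pipelines.training_pipeline import training_pipeline

if __name__ == "__main__":
    # ZenML injects steps.train_model.parameters.* as keyword arguments
    # into train_model() when the pipeline runs with this config file.
    training_pipeline.with_options(config_path="configs/training.yaml")()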

floracast/materializers/tft_materializer.py

Lines changed: 25 additions & 17 deletions
@@ -32,22 +32,30 @@ def load(self, data_type: Type[Any]) -> Any:
         """Load a TFT model using enhanced reconstruction strategy."""
         # using top-level TFTModel import

-        # Check what save strategies were used
-        strategy_info = self._load_strategy_info()
+        # Set PyTorch default dtype to float32 for consistent precision
+        original_dtype = torch.get_default_dtype()
+        torch.set_default_dtype(torch.float32)

-        # Try enhanced reconstruction if PyTorch state was saved
-        if strategy_info.get("pytorch_model_saved", False):
+        try:
+            # Check what save strategies were used
+            strategy_info = self._load_strategy_info()
+
+            # Try enhanced reconstruction if PyTorch state was saved
+            if strategy_info.get("pytorch_model_saved", False):
+                try:
+                    return self._load_with_pytorch_state()
+                except Exception as e:
+                    logger.warning(f"Enhanced reconstruction failed: {e}")
+
+            # Fallback to pickle loading
             try:
-                return self._load_with_pytorch_state()
+                return self._load_pickle_format()
             except Exception as e:
-                logger.warning(f"Enhanced reconstruction failed: {e}")
-
-        # Fallback to pickle loading
-        try:
-            return self._load_pickle_format()
-        except Exception as e:
-            logger.error(f"All loading strategies failed: {e}")
-            raise
+                logger.error(f"All loading strategies failed: {e}")
+                raise
+        finally:
+            # Restore original PyTorch dtype
+            torch.set_default_dtype(original_dtype)

     def _load_native_format(self) -> Any:
         """Load TFT model using native Darts save format."""

@@ -206,10 +214,10 @@ def _load_pickle_format(self) -> Any:
         with fileio.open(pickle_path, "rb") as f:
             model = pickle.load(f)

-        logger.warning(
-            "Loaded from pickle - internal PyTorch model may be None"
-        )
-        return model
+            logger.warning(
+                "Loaded from pickle - internal PyTorch model may be None"
+            )
+            return model

     def save(self, data: Any) -> None:
         """Save TFT model using enhanced strategy that preserves internal PyTorch model."""

floracast/materializers/timeseries_materializer.py

Lines changed: 7 additions & 0 deletions
@@ -13,6 +13,7 @@
 from typing import Any, Dict, Type

 import pandas as pd
+import numpy as np
 import matplotlib

 # Use a non-interactive backend for headless environments

@@ -102,6 +103,12 @@ def load(self, data_type: Type[Any]) -> Any:
             df, time_col=time_col, value_cols=value_cols, freq=freq
         )

+        # Convert to float32 for hardware compatibility (MPS, mixed precision training)
+        logger.debug(
+            "Converting TimeSeries to float32 for hardware compatibility"
+        )
+        ts = ts.astype(np.float32)
+
         # Restore static covariates if present
         if fileio.exists(static_covariates_path):
             with fileio.open(static_covariates_path, "r") as f:
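
For reference, a minimal sketch (synthetic data, variable names are ours) of the cast the materializer now applies on load: Darts builds float64 series from a float64 DataFrame by default, which conflicts with float32-only training setups such as MPS or 32-true precision.

import numpy as np
import pandas as pd
from darts import TimeSeries

df = pd.DataFrame(
    {
        "ds": pd.date_range("2024-01-01", periods=30, freq="D"),
        "y": np.random.rand(30),  # float64 by default
    }
)
ts = TimeSeries.from_dataframe(df, time_col="ds", value_cols="y", freq="D")
ts = ts.astype(np.float32)  # same cast the materializer performs after reload
print(ts.dtype)  # float32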

floracast/pipelines/batch_inference_pipeline.py

Lines changed: 0 additions & 4 deletions
@@ -16,12 +16,8 @@ def batch_inference_pipeline() -> None:
     """
     Batch inference pipeline that loads model from Model Control Plane and generates predictions.
     """
-    logger.info("Starting FloraCast batch inference pipeline")
-
     # Step 1: Ingest data (simulate real-time data sources)
     raw_data = ingest_data(infer=True)

     # Step 2: Generate predictions using model from MCP (with scaling handled internally)
     batch_inference_predict(df=raw_data)
-
-    logger.info("Batch inference completed. Returning predictions DataFrame.")

floracast/steps/batch_infer.py

Lines changed: 15 additions & 10 deletions
@@ -2,26 +2,34 @@
 Batch inference step for FloraCast using ZenML Model Control Plane.
 """

-from typing import Annotated
+from typing import Annotated, Tuple
 import pandas as pd
 import numpy as np
 from darts import TimeSeries
 from zenml import step, get_step_context, log_metadata
 from zenml.logger import get_logger
 from zenml.client import Client
 from utils.prediction import iterative_predict
+from materializers.timeseries_materializer import DartsTimeSeriesMaterializer

 logger = get_logger(__name__)


-@step
+@step(
+    output_materializers={
+        "prediction_series": DartsTimeSeriesMaterializer,
+    }
+)
 def batch_inference_predict(
     df: pd.DataFrame,
     datetime_col: str = "ds",
     target_col: str = "y",
     freq: str = "D",
     horizon: int = 14,
-) -> Annotated[pd.DataFrame, "predictions"]:
+) -> Tuple[
+    Annotated[pd.DataFrame, "predictions"],
+    Annotated[TimeSeries, "prediction_series"],
+]:
     """
     Perform batch inference using the trained model from Model Control Plane.

@@ -34,19 +42,16 @@ def batch_inference_predict(

     Returns:
         DataFrame containing forecast results with columns ['ds', 'yhat']
+        TimeSeries containing the forecast results
     """
     logger.info(f"Performing batch inference with horizon: {horizon}")

     try:
-        # Convert DataFrame to TimeSeries
+        # Convert DataFrame to TimeSeries and cast to float32 for consistency
         logger.info("Converting DataFrame to TimeSeries")
         series = TimeSeries.from_dataframe(
             df, time_col=datetime_col, value_cols=target_col, freq=freq
-        )
-
-        # Cast to float32 for consistency with training data
-        logger.info("Converting TimeSeries to float32 for consistency")
-        series = series.astype(np.float32)
+        ).astype(np.float32)

         logger.info(f"Created TimeSeries with {len(series)} points")
         logger.info(

@@ -167,7 +172,7 @@ def batch_inference_predict(
             }
         )

-        return pred_df
+        return pred_df, predictions

     except Exception as e:
         logger.error(f"Batch inference failed: {str(e)}")

floracast/steps/evaluate.py

Lines changed: 1 addition & 1 deletion
@@ -236,7 +236,7 @@ def create_evaluation_visualization(
         return HTMLString(error_html)


-@step
+@step(enable_cache=False)
 def evaluate(
     model: object,
     train_series: TimeSeries,

floracast/steps/preprocess.py

Lines changed: 64 additions & 0 deletions
@@ -75,6 +75,70 @@ def preprocess_data(
     train_series = train_series.astype(np.float32)
     val_series = val_series.astype(np.float32)

+    # Check for NaN/inf values in scaled data
+    train_values = train_series.pd_dataframe().values
+    val_values = val_series.pd_dataframe().values
+
+    train_nan_count = np.isnan(train_values).sum()
+    train_inf_count = np.isinf(train_values).sum()
+    val_nan_count = np.isnan(val_values).sum()
+    val_inf_count = np.isinf(val_values).sum()
+
+    logger.info(
+        f"Data quality check - Train NaN: {train_nan_count}, Train Inf: {train_inf_count}"
+    )
+    logger.info(
+        f"Data quality check - Val NaN: {val_nan_count}, Val Inf: {val_inf_count}"
+    )
+
+    # Check for extreme values that could cause numerical instability
+    train_min, train_max = train_values.min(), train_values.max()
+    val_min, val_max = val_values.min(), val_values.max()
+    logger.info(
+        f"Value ranges - Train: [{train_min:.6f}, {train_max:.6f}], Val: [{val_min:.6f}, {val_max:.6f}]"
+    )
+
+    # Flag potentially problematic values
+    needs_cleaning = (
+        train_nan_count > 0
+        or train_inf_count > 0
+        or val_nan_count > 0
+        or val_inf_count > 0
+        or abs(train_min) > 1e6
+        or abs(train_max) > 1e6
+        or abs(val_min) > 1e6
+        or abs(val_max) > 1e6
+    )
+
+    if needs_cleaning:
+        logger.warning(
+            "Found potentially problematic values in scaled data - cleaning..."
+        )
+
+        # Replace NaN/Inf and clip extreme values
+        train_df = train_series.pd_dataframe()
+        val_df = val_series.pd_dataframe()
+
+        # Handle NaN/Inf
+        train_df = train_df.replace([np.inf, -np.inf], np.nan)
+        val_df = val_df.replace([np.inf, -np.inf], np.nan)
+
+        train_df = train_df.fillna(0.0)
+        val_df = val_df.fillna(0.0)
+
+        # Clip extreme values to reasonable range
+        train_df = train_df.clip(-10.0, 10.0)
+        val_df = val_df.clip(-10.0, 10.0)
+
+        train_series = TimeSeries.from_dataframe(train_df).astype(np.float32)
+        val_series = TimeSeries.from_dataframe(val_df).astype(np.float32)
+
+        logger.info(
+            "Cleaned data - replaced NaN/Inf and clipped to [-10, 10] range"
+        )
+    else:
+        logger.info("Data quality check passed - no problematic values found")
+
     # Return fitted scaler as artifact for inference use
     logger.info("Returning fitted scaler as artifact for inference use")
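
A compressed sketch of the sanity check this hunk introduces, written as a standalone helper (the function name is ours): it reports whether a scaled series contains NaN/Inf or magnitudes large enough to destabilize training.

import numpy as np
from darts import TimeSeries


def needs_cleaning(series: TimeSeries, max_abs: float = 1e6) -> bool:
    """Return True if the series has NaN/Inf or values beyond +/- max_abs."""
    values = series.pd_dataframe().values
    finite = values[np.isfinite(values)]
    too_large = finite.size > 0 and np.abs(finite).max() > max_abs
    return bool(np.isnan(values).any() or np.isinf(values).any() or too_large)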

floracast/steps/train.py

Lines changed: 34 additions & 1 deletion
@@ -5,7 +5,7 @@
 from typing import Annotated
 import torch
 from darts import TimeSeries
-from darts.models import TFTModel
+from darts.models import TFTModel, RNNModel
 from zenml import step
 from zenml.logger import get_logger
 from materializers.tft_materializer import (

@@ -30,6 +30,8 @@ def train_model(
     add_relative_index: bool = True,
     enable_progress_bar: bool = False,
     enable_model_summary: bool = False,
+    learning_rate: float = 1e-3,
+    weight_decay: float = 1e-5,
 ) -> Annotated[TFTModel, "trained_model"]:
     """Train a TFT forecasting model.

@@ -47,6 +49,8 @@
         add_relative_index: Whether to add relative index
         enable_progress_bar: Whether to show progress bar
         enable_model_summary: Whether to show model summary
+        learning_rate: Learning rate for optimizer
+        weight_decay: Weight decay for regularization

     Returns:
         Trained TFT model

@@ -63,17 +67,46 @@ def train_model(
         "n_epochs": n_epochs,
         "random_state": random_state,
         "add_relative_index": add_relative_index,
+        "optimizer_kwargs": {
+            "lr": learning_rate,
+            "weight_decay": weight_decay,
+        },
         "pl_trainer_kwargs": {
             "enable_progress_bar": enable_progress_bar,
             "enable_model_summary": enable_model_summary,
             "precision": "32-true",  # Use 32-bit precision for better hardware compatibility
+            "gradient_clip_val": 1.0,  # Standard gradient clipping
+            "gradient_clip_algorithm": "norm",  # Clip by norm
+            "detect_anomaly": True,  # Detect NaN/inf in loss
+            "max_epochs": n_epochs,
+            "check_val_every_n_epoch": 1,  # Validate every epoch
+            "accelerator": "cpu",  # Force CPU to avoid MPS issues
         },
     }

     logger.info(f"Training TFT model with params: {model_params}")

     # Initialize TFT model
     model = TFTModel(**model_params)
+
+    # Initialize model weights with Xavier/Glorot initialization for stability
+    def init_weights(m):
+        if isinstance(m, torch.nn.Linear):
+            torch.nn.init.xavier_uniform_(m.weight)
+            if m.bias is not None:
+                torch.nn.init.zeros_(m.bias)
+        elif isinstance(m, torch.nn.LSTM):
+            for name, param in m.named_parameters():
+                if "weight" in name:
+                    torch.nn.init.xavier_uniform_(param)
+                elif "bias" in name:
+                    torch.nn.init.zeros_(param)
+
+    # Apply weight initialization
+    if hasattr(model, "model") and model.model is not None:
+        model.model.apply(init_weights)
+        logger.info("Applied Xavier weight initialization to model")
+
     logger.info(f"Starting TFT training with {len(train_series)} data points")

     # Train the TFT model
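
The weight-initialization block relies on torch.nn.Module.apply, which recursively visits every submodule; the hasattr/None guard matters because a Darts TFTModel typically builds its internal PyTorch module lazily, so there may be nothing to initialize before fit(). A toy sketch of the same apply pattern (the module below is ours, not the TFT network):

import torch


def init_weights(m: torch.nn.Module) -> None:
    # Xavier-init Linear layers; other module types are left untouched.
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)


toy = torch.nn.Sequential(
    torch.nn.Linear(8, 16),
    torch.nn.ReLU(),
    torch.nn.Linear(16, 1),
)
toy.apply(init_weights)  # .apply() visits the Sequential itself and each child module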
