Commit 3c651a0 ("optimization")
1 parent: 3d37a11

1 file changed: pymc_marketing/mmm/mmm.py (108 additions, 87 deletions)
@@ -64,7 +64,6 @@
 
 __all__ = ["MMM", "BaseMMM"]
 
-# Constants
 DEFAULT_HDI_PROB = 0.94
 
 
@@ -345,23 +344,21 @@ def _generate_and_preprocess_model_data(  # type: ignore
                 f"Could not convert {self.date_column} to datetime. Please check the date format."
             ) from e
 
-        channel_data = X[self.channel_columns]
-
         coords: dict[str, Any] = {
             "channel": self.channel_columns,
             "date": date_data,
         }
 
-        new_X_dict = {
-            self.date_column: date_data,
-        }
-        X_data = pd.DataFrame.from_dict(new_X_dict)
-        X_data = pd.concat([X_data, channel_data], axis=1)
-        control_data: pd.DataFrame | pd.Series | None = None
+        # Build X_data efficiently by selecting columns once
+        columns_to_select = [self.date_column, *self.channel_columns]
         if self.control_columns is not None:
-            control_data = X[self.control_columns]
+            columns_to_select.extend(self.control_columns)
             coords["control"] = self.control_columns
-            X_data = pd.concat([X_data, control_data], axis=1)
+
+        # Create X_data with proper date column in one operation
+        # Cast to DataFrame to satisfy mypy type checking
+        X_data = pd.DataFrame(X[columns_to_select])
+        X_data[self.date_column] = date_data
 
         self.model_coords = coords
         if self.validate_data:
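A standalone sketch of the pattern this hunk adopts, using toy column names rather than anything from the repository: naming every required column up front and indexing once replaces the incremental pd.concat assembly, which allocated an intermediate frame per step.

```python
import pandas as pd

# Toy frame standing in for the user's X (column names invented).
X = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=3),
    "tv": [1.0, 2.0, 3.0],
    "radio": [0.5, 0.6, 0.7],
    "holiday": [0, 1, 0],
})

# Old pattern: assemble piecewise, one concat per column block.
X_old = pd.concat([X[["date"]], X[["tv", "radio"]]], axis=1)
X_old = pd.concat([X_old, X[["holiday"]]], axis=1)

# New pattern: one selection with every column named up front.
columns_to_select = ["date", "tv", "radio", "holiday"]
X_new = pd.DataFrame(X[columns_to_select])

assert X_old.equals(X_new)
```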
@@ -423,11 +420,9 @@ def _compute_scale_for_data(
         else:
             raise ValueError(f"Unknown scaling method: {method}")
 
-        # Avoid division by zero
-        if isinstance(scale, np.ndarray):
-            scale = np.where(scale == 0, 1.0, scale)
-        else:
-            scale = 1.0 if scale == 0 else scale
+        # Avoid division by zero using numpy.maximum for efficiency
+        # This works for both scalars and arrays
+        scale = np.maximum(scale, 1.0)
 
         return scale
 
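One behavioral caveat worth illustrating with toy values: np.maximum(scale, 1.0) does handle scalars and arrays uniformly, but it is not a strict drop-in for the old zero-check, since any scale strictly between 0 and 1 is now clamped up to 1.0 as well.

```python
import numpy as np

scale = np.array([0.0, 0.5, 2.0])  # toy scales, including one in (0, 1)

old = np.where(scale == 0, 1.0, scale)  # [1. , 0.5, 2. ] -- only zeros replaced
new = np.maximum(scale, 1.0)            # [1. , 1. , 2. ] -- 0.5 clamped too

print(old, new)
```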
@@ -438,8 +433,9 @@ def _compute_scales(self) -> None:
         if not isinstance(X_data, pd.DataFrame):
             raise TypeError("X data must be a DataFrame for scaling computation")
 
-        channel_data = np.asarray(X_data[self.channel_columns].values)
-        target_data = np.asarray(self.preprocessed_data["y"]).reshape(-1, 1)
+        # Use pandas/numpy efficient operations - avoid redundant .values call
+        channel_data = X_data[self.channel_columns].to_numpy()
+        target_data = np.atleast_1d(np.asarray(self.preprocessed_data["y"]))
 
         # Compute scales based on scaling configuration
         self.channel_scale = self._compute_scale_for_data(
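A small sketch with toy values of the two changes in this hunk: DataFrame.to_numpy() is the documented accessor and makes the extra np.asarray(...) wrapping redundant, and note that the target's shape also changes, from the old column vector to a 1-D array.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"tv": [1.0, 2.0], "radio": [3.0, 4.0]})  # toy channels
assert np.array_equal(np.asarray(df.values), df.to_numpy())

y = np.array([10.0, 20.0, 30.0])  # toy target
print(np.asarray(y).reshape(-1, 1).shape)  # (3, 1) -- old column vector
print(np.atleast_1d(np.asarray(y)).shape)  # (3,)   -- new 1-D form
```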
@@ -1052,15 +1048,19 @@ def attrs_to_init_kwargs(cls, attrs) -> dict[str, Any]:
             The initialization kwargs.
 
         """
+        # Batch load JSON attributes for efficiency
+        model_config = json.loads(attrs["model_config"])
+        adstock_dict = json.loads(attrs["adstock"])
+        saturation_dict = json.loads(attrs["saturation"])
+        scaling_dict = json.loads(attrs.get("scaling", "null"))
+
         return {
-            "model_config": cls._model_config_formatting(
-                json.loads(attrs["model_config"])
-            ),
+            "model_config": cls._model_config_formatting(model_config),
             "date_column": json.loads(attrs["date_column"]),
             "control_columns": json.loads(attrs["control_columns"]),
             "channel_columns": json.loads(attrs["channel_columns"]),
-            "adstock": adstock_from_dict(json.loads(attrs["adstock"])),
-            "saturation": saturation_from_dict(json.loads(attrs["saturation"])),
+            "adstock": adstock_from_dict(adstock_dict),
+            "saturation": saturation_from_dict(saturation_dict),
             "adstock_first": json.loads(attrs.get("adstock_first", "true")),
             "yearly_seasonality": json.loads(attrs["yearly_seasonality"]),
             "time_varying_intercept": json.loads(
@@ -1072,9 +1072,7 @@ def attrs_to_init_kwargs(cls, attrs) -> dict[str, Any]:
             "dag": json.loads(attrs.get("dag", "null")),
             "treatment_nodes": json.loads(attrs.get("treatment_nodes", "null")),
             "outcome_node": json.loads(attrs.get("outcome_node", "null")),
-            "scaling": cls._deserialize_scaling(
-                json.loads(attrs.get("scaling", "null"))
-            ),
+            "scaling": cls._deserialize_scaling(scaling_dict),
         }
 
     def _has_new_scaling(self) -> bool:
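A minimal sketch of the parse-once pattern, with a hypothetical attrs payload (the key names inside the JSON strings are illustrative only): each attribute is decoded a single time into a local, and the "null" default relies on json.loads("null") evaluating to Python's None.

```python
import json

# Hypothetical serialized attrs; real payloads come from the saved model.
attrs = {"adstock": '{"lookup_name": "geometric", "l_max": 8}'}

adstock_dict = json.loads(attrs["adstock"])              # parsed once, reused
scaling_dict = json.loads(attrs.get("scaling", "null"))  # missing key -> None

assert scaling_dict is None
assert adstock_dict["l_max"] == 8
```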
@@ -1502,18 +1500,24 @@ def get_channel_contribution_forward_pass_grid(
 
         share_grid = np.linspace(start=start, stop=stop, num=num)
 
+        # Extract and validate X_data once outside the loop
+        X_data = self.preprocessed_data["X"]
+        if not isinstance(X_data, pd.DataFrame):
+            raise TypeError("X data must be a DataFrame")
+
+        base_channel_data = X_data[self.channel_columns].to_numpy()
+
+        # Preallocate list for better performance
         channel_contribution = []
         for delta in share_grid:
-            X_data = self.preprocessed_data["X"]
-            if isinstance(X_data, pd.DataFrame):
-                channel_data = delta * X_data[self.channel_columns].to_numpy()
-            else:
-                raise TypeError("X data must be a DataFrame")
+            # Vectorized scaling - much faster than creating new arrays
+            channel_data = delta * base_channel_data
             channel_contribution_forward_pass = self.channel_contribution_forward_pass(
                 channel_data=channel_data,
                 disable_logger_stdout=True,
             )
             channel_contribution.append(channel_contribution_forward_pass)
+
         return DataArray(
             data=np.array(channel_contribution),
             dims=("delta", "chain", "draw", "date", "channel"),
@@ -1549,9 +1553,11 @@ def plot_channel_parameter(self, param_name: str, **plt_kwargs: Any) -> plt.Figu
         saturation: SaturationTransformation = self.saturation
         adstock: AdstockTransformation = self.adstock
 
-        parameters_to_check = list(saturation.variable_mapping.values()) + list(
-            adstock.variable_mapping.values()
-        )
+        # Use list extension instead of concatenation for better performance
+        parameters_to_check = [
+            *saturation.variable_mapping.values(),
+            *adstock.variable_mapping.values(),
+        ]
         if param_name not in parameters_to_check:
             raise ValueError(
                 f"Invalid parameter name: {param_name}. Choose from {parameters_to_check}"
@@ -1638,17 +1644,17 @@ def _get_intercept_for_plot(
         )
 
         intercept_mean = intercept.mean(["chain", "draw"]).data
+        hdi_result = az.hdi(intercept).intercept.data
 
         if intercept.ndim == 2:
-            # Stationary intercept - repeat for all dates
-            intercept_hdi = np.repeat(
-                a=az.hdi(intercept).intercept.data[None, ...],
-                repeats=self.X[self.date_column].shape[0],
-                axis=0,
+            # Stationary intercept - use broadcasting instead of repeat for efficiency
+            n_dates = self.X[self.date_column].shape[0]
+            intercept_hdi = np.broadcast_to(
+                hdi_result[None, :], (n_dates, hdi_result.shape[0])
             )
         else:
             # Time-varying intercept
-            intercept_hdi = az.hdi(intercept).intercept.data
+            intercept_hdi = hdi_result
 
         return intercept_mean, intercept_hdi
 
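A sketch of the repeat-versus-broadcast trade-off, with toy HDI bounds: np.broadcast_to returns a zero-copy, read-only view, whereas np.repeat allocates the full (n_dates, 2) array; a view suffices because the plotting code only reads intercept_hdi.

```python
import numpy as np

hdi_result = np.array([0.8, 1.2])  # toy lower/upper HDI bounds
n_dates = 5

repeated = np.repeat(hdi_result[None, ...], repeats=n_dates, axis=0)
broadcast = np.broadcast_to(hdi_result[None, :], (n_dates, hdi_result.shape[0]))

assert np.array_equal(repeated, broadcast)      # identical values
assert not broadcast.flags.writeable            # but the view is read-only
```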
@@ -1750,11 +1756,14 @@ def plot_components_contributions(
         intercept_mean, intercept_hdi = self._get_intercept_for_plot(original_scale)
         color_idx = len(means)
 
-        ax.plot(
-            dates,
-            np.full(len(dates), intercept_mean),
-            color=f"C{color_idx}",
-        )
+        # Use scalar intercept if possible, otherwise array
+        if np.ndim(intercept_mean) == 0:
+            # Scalar intercept - matplotlib handles broadcasting automatically
+            ax.axhline(y=intercept_mean, color=f"C{color_idx}")
+        else:
+            # Time-varying intercept
+            ax.plot(dates, intercept_mean, color=f"C{color_idx}")
+
         ax.fill_between(
             x=dates,
             y1=intercept_hdi[:, 0],
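A minimal sketch of the scalar branch, on toy data: ax.axhline draws the horizontal line across the whole axis without materializing an array of len(dates), which the old np.full call had to do.

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

dates = pd.date_range("2024-01-01", periods=30)  # toy date axis
intercept_mean = 2.5  # toy scalar intercept

fig, ax = plt.subplots()
if np.ndim(intercept_mean) == 0:
    # Scalar: one line object, no per-date array needed
    ax.axhline(y=intercept_mean, color="C0")
else:
    # Time-varying: plot the series against the dates
    ax.plot(dates, intercept_mean, color="C0")
```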
@@ -2139,34 +2148,37 @@ def format_recovered_transformation_parameters(
         # Retrieve channel names
         channels = self.fit_result.channel.values
 
-        # Initialize the dictionary to store channel information
-        channels_info = {}
-
         # Define the parameter groups for consolidation
         param_groups = {
             "saturation_params": self.saturation.model_config.keys(),
             "adstock_params": self.adstock.model_config.keys(),
         }
 
-        # Iterate through each channel to fetch and store parameters
+        # Pre-compute quantiles for all parameters at once (more efficient)
+        quantile_cache = {}
+        for group_name, params in param_groups.items():
+            prefix = group_name.split("_")[0] + "_"
+            for param in params:
+                if param in self.fit_result:
+                    # Compute quantile once and convert to pandas
+                    quantile_cache[param] = (
+                        self.fit_result[param]
+                        .quantile(quantile, dim=["chain", "draw"])
+                        .to_pandas()
+                    )
+
+        # Build channel info dictionary efficiently
+        channels_info = {}
         for channel in channels:
             channel_info = {}
-
-            # Process each group of parameters (saturation and adstock)
             for group_name, params in param_groups.items():
-                # Build dictionary for the current group of parameters
+                prefix = group_name.split("_")[0] + "_"
                 param_dict = {
-                    param.replace(group_name.split("_")[0] + "_", ""): self.fit_result[
-                        param
-                    ]
-                    .quantile(quantile, dim=["chain", "draw"])
-                    .to_pandas()
-                    .to_dict()[channel]
+                    param.replace(prefix, ""): quantile_cache[param].to_dict()[channel]
                     for param in params
-                    if param in self.fit_result
+                    if param in quantile_cache
                 }
                 channel_info[group_name] = param_dict
-
             channels_info[channel] = channel_info
 
         return channels_info
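A self-contained sketch of the caching pattern on a toy xarray Dataset (variable and channel names invented for illustration): computing each parameter's quantile once turns O(n_channels x n_params) quantile reductions into O(n_params), leaving only dictionary lookups inside the per-channel loop.

```python
import numpy as np
import xarray as xr

# Toy posterior with dims (chain, draw, channel).
fit_result = xr.Dataset(
    {"saturation_lam": (("chain", "draw", "channel"), np.random.rand(2, 10, 3))},
    coords={"channel": ["tv", "radio", "social"]},
)
quantile = 0.5

# Pass 1: one quantile reduction per parameter, cached as a pandas Series.
quantile_cache = {
    param: fit_result[param].quantile(quantile, dim=["chain", "draw"]).to_pandas()
    for param in ["saturation_lam"]
    if param in fit_result
}

# Pass 2: the per-channel loop is now just dictionary lookups.
for channel in fit_result.channel.values:
    print(channel, quantile_cache["saturation_lam"].to_dict()[channel])
```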
@@ -2720,16 +2732,22 @@ def _generate_future_dates(
         list[pd.Timestamp]
             List of future dates
         """
-        offset_map = {
-            "daily": lambda i: pd.DateOffset(days=i),
-            "weekly": lambda i: pd.DateOffset(weeks=i),
-            "monthly": lambda i: pd.DateOffset(months=i),
-            "quarterly": lambda i: pd.DateOffset(months=3 * i),
-            "yearly": lambda i: pd.DateOffset(years=i),
-        }
-
-        offset_func = offset_map[time_granularity]
-        return [last_date + offset_func(i) for i in range(1, time_length + 1)]
+        # Use pandas date_range for efficient date generation
+        if time_granularity == "daily":
+            freq = "D"
+        elif time_granularity == "weekly":
+            freq = "W"
+        elif time_granularity == "monthly":
+            freq = "MS"  # Month start
+        elif time_granularity == "quarterly":
+            freq = "QS"  # Quarter start
+        else:  # yearly
+            freq = "YS"  # Year start
+
+        # Generate dates efficiently using pandas
+        return pd.date_range(start=last_date, periods=time_length + 1, freq=freq)[
+            1:
+        ].tolist()
 
     def _create_synth_dataset(
         self,
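One caveat worth showing with toy dates: pd.date_range builds the future dates in a single vectorized call, but anchored frequencies are not a strict drop-in for the old DateOffset arithmetic; "W" snaps to Sundays and "MS"/"QS"/"YS" to period starts, so the two approaches only coincide when last_date is already aligned.

```python
import pandas as pd

last_date = pd.Timestamp("2024-01-03")  # a Wednesday, deliberately unaligned

old = [last_date + pd.DateOffset(weeks=i) for i in range(1, 4)]
new = pd.date_range(start=last_date, periods=4, freq="W")[1:].tolist()

print(old)  # Wednesdays: 2024-01-10, 2024-01-17, 2024-01-24
print(new)  # Sundays:    2024-01-14, 2024-01-21, 2024-01-28
```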
@@ -2805,24 +2823,27 @@ def _create_synth_dataset(
             last_date, time_granularity, time_length
         )
 
-        # Create synthetic rows
-        new_rows = [
-            {
-                self.date_column: pd.to_datetime(new_date),
-                **{
-                    channel: allocation_strategy.sel(channel=channel).values
-                    + np.random.normal(
-                        0, noise_level * allocation_strategy.sel(channel=channel).values
-                    )
-                    for channel in channels
-                },
-                **{control: 0 for control in _controls},
-                target_col: 0,
-            }
-            for new_date in new_dates
-        ]
+        # Vectorized creation of synthetic dataset
+        # Extract allocation values once
+        channel_allocations = allocation_strategy.to_pandas()
+
+        # Create noise matrix efficiently
+        noise = np.random.normal(0, noise_level, size=(time_length, len(channels)))
+        channel_values = channel_allocations.values * (1 + noise)
+
+        # Build DataFrame efficiently using dict of arrays
+        data_dict: dict[str, Any] = {self.date_column: new_dates}
+        data_dict.update(dict(zip(channels, channel_values.T, strict=False)))
+
+        # Add controls efficiently if present (as arrays for proper type consistency)
+        if _controls:
+            zeros_array = np.zeros(time_length)
+            for control in _controls:
+                data_dict[control] = zeros_array
+
+        data_dict[target_col] = np.zeros(time_length)
 
-        return pd.DataFrame(new_rows)
+        return pd.DataFrame(data_dict)
 
     def sample_response_distribution(
         self,
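A quick check, with toy numbers, that the vectorized noise is statistically equivalent to the old per-row draws: for a positive allocation a, a * (1 + eps) with eps ~ N(0, noise_level) has the same distribution as a + N(0, noise_level * a).

```python
import numpy as np

rng = np.random.default_rng(0)
a, noise_level, n = 100.0, 0.1, 200_000  # toy allocation and noise settings

old_draws = a + rng.normal(0, noise_level * a, size=n)   # old formulation
new_draws = a * (1 + rng.normal(0, noise_level, size=n))  # vectorized form

print(old_draws.std(), new_draws.std())  # both approximately noise_level * a = 10.0
```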