Changed dist_select and flow_select plotting from plotnine to matplotlib/seaborn

Alexander März · Alexander März · commit fe6a5888af9e · 2023-08-28T15:49:35.000+02:00
diff --git a/lightgbmlss/distributions/distribution_utils.py b/lightgbmlss/distributions/distribution_utils.py
@@ -9,7 +9,8 @@
 from tqdm import tqdm
 
 from typing import Any, Dict, Optional, List, Tuple
-from plotnine import *
+import matplotlib.pyplot as plt
+import seaborn as sns
 import warnings
 
 
@@ -598,7 +599,6 @@ def dist_select(self,
                     target: np.ndarray,
                     candidate_distributions: List,
                     max_iter: int = 100,
-                    n_samples: int = 1000,
                     plot: bool = False,
                     figure_size: tuple = (10, 5),
                     ) -> pd.DataFrame:
@@ -614,8 +614,6 @@ def dist_select(self,
             List of candidate distributions.
         max_iter: int
             Maximum number of iterations for the optimization.
-        n_samples: int
-            Number of samples to draw from the fitted distribution.
         plot: bool
             If True, a density plot of the actual and fitted distribution is created.
         figure_size: tuple
@@ -650,11 +648,11 @@ def dist_select(self,
                          }
                     )
                 dist_list.append(fit_df)
-                fit_df = pd.concat(dist_list).sort_values(by=self.loss_fn, ascending=True)
-                fit_df["rank"] = fit_df[self.loss_fn].rank().astype(int)
-                fit_df.set_index(fit_df["rank"], inplace=True)
                 pbar.update(1)
             pbar.set_description(f"Fitting of candidate distributions completed")
+            fit_df = pd.concat(dist_list).sort_values(by=self.loss_fn, ascending=True)
+            fit_df["rank"] = fit_df[self.loss_fn].rank().astype(int)
+            fit_df.set_index(fit_df["rank"], inplace=True)
 
         if plot:
             # Select best distribution
@@ -675,29 +673,19 @@ def dist_select(self,
                 axis=1,
             )
             fitted_params = pd.DataFrame(fitted_params, columns=best_dist_sel.param_dict.keys())
-            fitted_params.columns = best_dist_sel.param_dict.keys()
+            n_samples = np.max([10000, target.shape[0]])
+            n_samples = np.where(n_samples > 500000, 100000, n_samples)
             dist_samples = best_dist_sel.draw_samples(fitted_params,
                                                       n_samples=n_samples,
                                                       seed=123).values
 
             # Plot actual and fitted distribution
-            plot_df_actual = pd.DataFrame({"y": target.reshape(-1,), "type": "Actual"})
-            plot_df_fitted = pd.DataFrame({"y": dist_samples.reshape(-1,),
-                                           "type": f"Best-Fit: {best_dist['distribution'].values[0]}"})
-            plot_df = pd.concat([plot_df_actual, plot_df_fitted])
-
-            print(
-                ggplot(plot_df,
-                       aes(x="y",
-                           color="type")) +
-                geom_density(alpha=0.5) +
-                theme_bw(base_size=15) +
-                theme(figure_size=figure_size,
-                      legend_position="right",
-                      legend_title=element_blank(),
-                      plot_title=element_text(hjust=0.5)) +
-                labs(title=f"Actual vs. Fitted Density")
-            )
+            plt.figure(figsize=figure_size)
+            sns.kdeplot(target.reshape(-1, ), label="Actual")
+            sns.kdeplot(dist_samples.reshape(-1, ), label=f"Best-Fit: {best_dist['distribution'].values[0]}")
+            plt.legend()
+            plt.title("Actual vs. Best-Fit Density", fontweight="bold", fontsize=16)
+            plt.show()
 
         fit_df.drop(columns=["rank", "params"], inplace=True)
 
diff --git a/lightgbmlss/distributions/flow_utils.py b/lightgbmlss/distributions/flow_utils.py
@@ -11,7 +11,8 @@
 from tqdm import tqdm
 
 from typing import Any, Dict, Optional, List, Tuple
-from plotnine import * 
+import matplotlib.pyplot as plt
+import seaborn as sns
 import warnings
 
 
@@ -637,7 +638,6 @@ def flow_select(self,
                     target: np.ndarray,
                     candidate_flows: List,
                     max_iter: int = 100,
-                    n_samples: int = 1000,
                     plot: bool = False,
                     figure_size: tuple = (10, 5),
                     ) -> pd.DataFrame:
@@ -653,8 +653,6 @@ def flow_select(self,
             List of candidate normalizing flow specifications.
         max_iter: int
             Maximum number of iterations for the optimization.
-        n_samples: int
-            Number of samples drawn from the fitted distribution.
         plot: bool
             If True, a density plot of the actual and fitted distribution is created.
         figure_size: tuple
@@ -692,11 +690,11 @@ def flow_select(self,
                          }
                     )
                 flow_list.append(fit_df)
-                fit_df = pd.concat(flow_list).sort_values(by=flow_sel.loss_fn, ascending=True)
-                fit_df["rank"] = fit_df[flow_sel.loss_fn].rank().astype(int)
-                fit_df.set_index(fit_df["rank"], inplace=True)
                 pbar.update(1)
             pbar.set_description(f"Fitting of candidate normalizing flows completed")
+            fit_df = pd.concat(flow_list).sort_values(by=flow_sel.loss_fn, ascending=True)
+            fit_df["rank"] = fit_df[flow_sel.loss_fn].rank().astype(int)
+            fit_df.set_index(fit_df["rank"], inplace=True)
 
         if plot:
             # Select normalizing flow with the lowest loss
@@ -713,29 +711,17 @@ def flow_select(self,
             flow_params = torch.tensor(best_flow["params"][0]).reshape(1, -1)
             flow_dist_sel = best_flow_sel.create_spline_flow(input_dim=1)
             _, flow_dist_sel = best_flow_sel.replace_parameters(flow_params, flow_dist_sel)
-            flow_samples = pd.DataFrame(flow_dist_sel.sample((n_samples,)).squeeze().detach().numpy().T)
+            n_samples = np.max([10000, target.shape[0]])
+            n_samples = np.where(n_samples > 500000, 100000, n_samples)
+            flow_samples = pd.DataFrame(flow_dist_sel.sample((n_samples,)).squeeze().detach().numpy().T).values
 
             # Plot actual and fitted distribution
-            flow_samples["type"] = f"Best-Fit: {best_flow['NormFlow'].values[0]}"
-
-            df_actual = pd.DataFrame(target)
-            df_actual["type"] = "Data"
-
-            plot_df = pd.concat([df_actual, flow_samples]).rename(columns={0: "variable"})
-
-            print(
-                ggplot(plot_df,
-                       aes(x="variable",
-                           color="type")) +
-                geom_density(size=1.1) +
-                theme_bw(base_size=15) +
-                theme(figure_size=figure_size,
-                      legend_position="right",
-                      legend_title=element_blank(),
-                      plot_title=element_text(hjust=0.5)) +
-                labs(title=f"Actual vs. Fitted Density",
-                     x="")
-            )
+            plt.figure(figsize=figure_size)
+            sns.kdeplot(target.reshape(-1, ), label="Actual")
+            sns.kdeplot(flow_samples.reshape(-1, ), label=f"Best-Fit: {best_flow['NormFlow'].values[0]}")
+            plt.legend()
+            plt.title("Actual vs. Best-Fit Density", fontweight="bold", fontsize=16)
+            plt.show()
 
         fit_df.drop(columns=["rank", "params"], inplace=True)