theislab
diff --git a/batchglm/__init__.py b/batchglm/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/batchglm/api/__init__.py b/batchglm/api/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/batchglm/api/utils/__init__.py b/batchglm/api/utils/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/batchglm/api/utils/linalg.py b/batchglm/api/utils/linalg.py
Lines changed: 1 addition & 0 deletions
diff --git a/batchglm/api/utils/numeric.py b/batchglm/api/utils/numeric.py
Lines changed: 1 addition & 0 deletions
diff --git a/batchglm/benchmark/nb_glm/base.py b/batchglm/benchmark/nb_glm/base.py
Lines changed: 30 additions & 6 deletions
diff --git a/batchglm/benchmark/nb_glm/convergence.py b/batchglm/benchmark/nb_glm/convergence.py
Lines changed: 86 additions & 29 deletions
@@ -3,4 +3,4 @@
 __version__ = get_versions()['version']
 del get_versions
 
-from .log_cfg import logger, unconfigure_logging, enable_logging
+from .log_cfg import logger, unconfigure_logging, setup_logging
@@ -1,5 +1,5 @@
 from .. import __version__
-from ..log_cfg import logger, unconfigure_logging, enable_logging
+from ..log_cfg import logger, unconfigure_logging, setup_logging
 
 from . import models
 from . import data
 
@@ -1,2 +1,4 @@
 from . import stats
 from . import random
+from . import numeric
+from . import linalg
@@ -0,0 +1 @@
+from batchglm.utils.linalg import stacked_lstsq, groupwise_solve_lm
@@ -0,0 +1 @@
+from batchglm.utils.numeric import combine_matrices, softmax, weighted_mean, weighted_variance
@@ -2,6 +2,7 @@
 
 import os
 import logging
+import traceback
 
 from collections import OrderedDict
 import itertools
@@ -14,6 +15,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def init_benchmark(
         root_dir: str,
         sim: Simulator,
@@ -110,10 +112,16 @@ def prepare_benchmark_sample(
     return sample_config
 
 
-def get_benchmark_samples(root_dir: str, config_file="config.yml"):
+def load_config(root_dir, config_file):
     config_file = os.path.join(root_dir, config_file)
     with open(config_file, mode="r") as f:
         config = yaml.load(f)
+
+    return config
+
+
+def get_benchmark_samples(root_dir: str, config_file="config.yml"):
+    config = load_config(root_dir, config_file)
     return list(config["benchmark_samples"].keys())
 
 
@@ -186,18 +194,29 @@ def load_benchmark_dataset(root_dir: str, config_file="config.yml") -> Tuple[Sim
     benchmark_data = []
     for smpl, cfg in benchmark_samples.items():
         wd = cfg["working_dir"]
-        logger.info("opening working dir: %s", wd)
+
         ds_path = os.path.join(root_dir, wd, "cache.zarr")
+        ds_cache_OK = os.path.join(root_dir, wd, "cache_OK")
+
+        logger.info("opening working dir: %s", wd)
         try:  # try open zarr cache
+            if not os.path.exists(ds_cache_OK):
+                raise FileNotFoundError
+
             data = xr.open_zarr(ds_path)
             logger.info("using zarr cache: %s", os.path.join(wd, "cache.zarr"))
-        except:  # open netcdf4 files
+        except BaseException as e:  # open netcdf4 files
+            if isinstance(e, FileNotFoundError):
+                pass
+            else:
+                traceback.print_exc()
+
             logger.info("loading step-wise netcdf4 files...")
             ncdf_data = xr.open_mfdataset(
                 os.path.join(root_dir, cfg["working_dir"], "estimation-*.h5"),
-                engine="netcdf4",
+                engine="h5netcdf",
                 concat_dim="step",
-                autoclose=True,
+                # autoclose=True,
                 parallel=True,
             )
             ncdf_data = ncdf_data.sortby("global_step")
@@ -206,13 +225,18 @@ def load_benchmark_dataset(root_dir: str, config_file="config.yml") -> Tuple[Sim
 
             try:  # try to save data in zarr cache
                 zarr_data = ncdf_data.to_zarr(ds_path)
+                touch(ds_cache_OK)
+
                 logger.info("Stored data in zarr cache")
 
                 # close netcdf4 data sets
                 ncdf_data.close()
                 del ncdf_data
                 data = zarr_data
-            except:  # use netcdf4 since zarr does not seem to work
+            except BaseException as e:  # use netcdf4 since zarr does not seem to work
+                traceback.print_exc()
+
+                logger.info("falling back to step-wise netcdf4 store")
                 data = ncdf_data
 
         benchmark_data.append(data)
 
@@ -4,20 +4,30 @@
 import shutil
 import logging
 
+import pandas as pd
 import scipy.stats
 import numpy as np
 import xarray as xr
 import yaml
 
-from .base import init_benchmark, get_benchmark_samples, run_benchmark, load_benchmark_dataset
+from .base import init_benchmark, get_benchmark_samples, run_benchmark, load_benchmark_dataset, load_config
 from .base import Simulator
 
 import batchglm.utils.stats as stat_utils
 
 logger = logging.getLogger(__name__)
 
 
-def plot_benchmark(root_dir: str, config_file="config.yml"):
+def group_by_training(benchmark_configs, keys=("optim_algo", "learning_rate")):
+    benchmark_df = pd.DataFrame.from_dict({
+        benchmark: [cfg["training_args"][k] for k in keys]
+        for benchmark, cfg in benchmark_configs.items()
+    }, orient='index', columns=keys)
+
+    return benchmark_df
+
+
+def plot_all_benchmarks(root_dir, config_file="config.yml"):
     logger.info("loading config...", end="", flush=True)
     config_file = os.path.join(root_dir, config_file)
     with open(config_file, mode="r") as f:
@@ -28,48 +38,71 @@ def plot_benchmark(root_dir: str, config_file="config.yml"):
 
     logger.info("loading data...", end="", flush=True)
     sim, benchmark_data = load_benchmark_dataset(root_dir)
-    benchmark_data.coords["time_elapsed"] = benchmark_data.time_elapsed.cumsum("step")
     logger.info("\t[OK]")
 
+    plot_benchmarks(
+        plot_dir=plot_dir,
+        sim=sim,
+        benchmark_data=benchmark_data,
+        benchmark_names=benchmark_data.coords["benchmark"]
+    )
+
+
+def plot_benchmarks(plot_dir: str, sim, benchmark_data, benchmark_names):
+    benchmark_data = benchmark_data.assign_coords(**{
+        "time_elapsed": benchmark_data.time_elapsed.cumsum("step"),
+    })
+    benchmark_data.coords["benchmark"] = xr.DataArray(
+        dims=("benchmark",),
+        data=benchmark_names
+    )
+
+    groupby_col = "benchmark"
+    linewidth = 0.5
+
     import plotnine as pn
     import matplotlib.pyplot as plt
+    plt.rcParams["legend.loc"] = "center left"
 
     from dask.diagnostics import ProgressBar
 
     def plot_stat(val, val_name, name_prefix, scale_y_log10=False):
         with ProgressBar():
-            df = val.to_dataframe(val_name).reset_index()
+            df = val.to_dataframe(val_name)
+        df = df.reset_index()
 
         plot = (pn.ggplot(df)
-                + pn.aes(x="time_elapsed", y=val_name, group="benchmark", color="benchmark")
+                + pn.aes(x="time_elapsed", y=val_name, group=groupby_col, color=groupby_col)
                 + pn.geom_line()
-                + pn.geom_vline(xintercept=df.location[[np.argmin(df[val_name])]].time_elapsed.values[0], color="black")
+                + pn.geom_vline(xintercept=df.loc[[np.argmin(df[val_name])]].time_elapsed.values[0], color="black")
                 + pn.geom_hline(yintercept=np.min(df[val_name]), alpha=0.5)
                 )
         if scale_y_log10:
             plot = plot + pn.scale_y_log10()
         plot.save(os.path.join(plot_dir, name_prefix + ".time.svg"), format="svg")
 
         plot = (pn.ggplot(df)
-                + pn.aes(x="global_step", y=val_name, group="benchmark", color="benchmark")
+                + pn.aes(x="global_step", y=val_name, group=groupby_col, color=groupby_col)
                 + pn.geom_line()
-                + pn.geom_vline(xintercept=df.location[[np.argmin(df[val_name])]].global_step.values[0], color="black")
+                + pn.geom_vline(xintercept=df.loc[[np.argmin(df[val_name])]].global_step.values[0], color="black")
                 + pn.geom_hline(yintercept=np.min(df[val_name]), alpha=0.5)
                 )
         if scale_y_log10:
             plot = plot + pn.scale_y_log10()
         plot.save(os.path.join(plot_dir, name_prefix + ".step.svg"), format="svg")
 
+        return df
+
     logger.info("plotting...")
     val: xr.DataArray = stat_utils.rmsd(
         np.exp(xr.DataArray(sim.params["a"][0], dims=("features",))),
         np.exp(benchmark_data.a.isel(design_loc_params=0)), axis=[0])
-    plot_stat(val, "mapd", "real_mu")
+    df = plot_stat(val, "mapd", "real_mu")
 
     val: xr.DataArray = stat_utils.rmsd(
         np.exp(xr.DataArray(sim.params["b"][0], dims=("features",))),
         np.exp(benchmark_data.b.isel(design_scale_params=0)), axis=[0])
-    plot_stat(val, "mapd", "real_r")
+    df = plot_stat(val, "mapd", "real_r")
 
     val: xr.DataArray = benchmark_data.loss
     plot_stat(val, "loss", "loss")
@@ -90,41 +123,65 @@ def plot_pval(window_size):
         t = t[:, window_size:]
         df = df[:, window_size:]
 
-        pval = t.copy()
-        pval[:, :] = scipy.stats.t(df).cdf(t)
-        pval.plot.line(hue="benchmark")
-        plt.savefig(os.path.join(plot_dir, "pval_convergence.%dsteps.svg" % window_size), format="svg")
-        # plt.show()
-        plt.close()
+        pval = xr.DataArray(
+            name="pval",
+            data=scipy.stats.t(df).cdf(t),
+            dims=t.dims,
+            coords=t.coords
+        )
+
+        fig, ax = plt.subplots()
+        lines = pval.plot.line(hue=groupby_col, linewidth=linewidth, ax=ax)
+        ax.get_legend().set_bbox_to_anchor((1, 0.5))
+        fig.savefig(os.path.join(plot_dir, "pval_convergence.%dsteps.svg" % window_size),
+                    format="svg", bbox_inches='tight')
+        # fig.show()
+        plt.close(fig)
 
+    plot_pval(25)
+    plot_pval(50)
     plot_pval(100)
     plot_pval(200)
     plot_pval(400)
 
-    benchmark_data.full_loss.plot.line(hue="benchmark")
-    plt.savefig(os.path.join(plot_dir, "full_loss.svg"), format="svg")
-    plt.close()
+    fig, ax = plt.subplots()
+    lines = benchmark_data.full_loss.plot.line(hue=groupby_col, linewidth=linewidth, ax=ax)
+    ax.set_ylabel('full loss')
+    ax.get_legend().set_bbox_to_anchor((1, 0.5))
+    fig.savefig(os.path.join(plot_dir, "full_loss.svg"), format="svg", bbox_inches='tight')
+    plt.close(fig)
 
-    benchmark_data.loss.plot.line(hue="benchmark")
-    plt.savefig(os.path.join(plot_dir, "batch_loss.svg"), format="svg")
-    plt.close()
+    fig, ax = plt.subplots()
+    lines = benchmark_data.loss.plot.line(hue=groupby_col, linewidth=linewidth, ax=ax)
+    ax.set_ylabel('batch loss')
+    ax.get_legend().set_bbox_to_anchor((1, 0.5))
+    fig.savefig(os.path.join(plot_dir, "batch_loss.svg"), format="svg", bbox_inches='tight')
+    plt.close(fig)
 
     def plot_loss_rolling_mean(window_size):
         logger.info("plotting rolling mean of batch loss with window size: %d" % window_size)
 
-        benchmark_data.loss.rolling(step=window_size).mean().plot.line(hue="benchmark")
-        plt.savefig(os.path.join(plot_dir, "batch_loss_rolling_mean.%dsteps.svg" % window_size), format="svg")
-        plt.close()
+        fig, ax = plt.subplots()
+        lines = benchmark_data.loss.rolling(step=window_size).mean().plot.line(
+            hue=groupby_col, linewidth=linewidth, ax=ax)
+        ax.set_ylabel('rolling mean')
+        ax.get_legend().set_bbox_to_anchor((1, 0.5))
+        fig.savefig(os.path.join(plot_dir, "batch_loss_rolling_mean.%dsteps.svg" % window_size),
+                    format="svg", bbox_inches='tight')
+        plt.close(fig)
 
     plot_loss_rolling_mean(25)
     plot_loss_rolling_mean(50)
     plot_loss_rolling_mean(100)
     plot_loss_rolling_mean(200)
 
+    fig, ax = plt.subplots()
     with ProgressBar():
-        benchmark_data.full_gradient.mean(dim="features").plot.line(hue="benchmark")
-    plt.savefig(os.path.join(plot_dir, "mean_full_gradient.svg"), format="svg")
-    plt.close()
+        lines = benchmark_data.full_gradient.mean(dim="features").plot.line(
+            hue=groupby_col, linewidth=linewidth, ax=ax)
+    ax.get_legend().set_bbox_to_anchor((1, 0.5))
+    fig.savefig(os.path.join(plot_dir, "mean_full_gradient.svg"), format="svg", bbox_inches='tight')
+    plt.close(fig)
 
     logger.info("ready")
 
@@ -214,7 +271,7 @@ def main():
         for smpl in benchmark_samples:
             logger.info(smpl)
     elif action == "plot":
-        plot_benchmark(root_dir)
+        plot_all_benchmarks(root_dir)
     elif action == "clean":
         clean(root_dir)
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from batchglm.utils.linalg import stacked_lstsq, groupwise_solve_lm`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from batchglm.utils.numeric import combine_matrices, softmax, weighted_mean, weighted_variance`