Commit 154e871

Merge branch 'rsa_dev' into dev

2 parents: 66f2bf9 + 1e3032d

File tree: 9 files changed, +132 -102 lines changed


batchglm/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -2,3 +2,5 @@
 
 __version__ = get_versions()['version']
 del get_versions
+
+from .log_cfg import logger, unconfigure_logging, enable_logging

batchglm/api/__init__.py
Lines changed: 3 additions & 0 deletions

@@ -1,3 +1,6 @@
+from .. import __version__
+from ..log_cfg import logger, unconfigure_logging, enable_logging
+
 from . import models
 from . import data
 from . import utils
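
After this change the logging helpers are reachable from both the package root and batchglm.api; a quick hypothetical check:

    import batchglm
    import batchglm.api

    # Both names resolve to the same objects defined in batchglm.log_cfg:
    assert batchglm.logger is batchglm.api.logger
    assert batchglm.enable_logging is batchglm.api.enable_logging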

batchglm/benchmark/nb_glm/base.py
Lines changed: 53 additions & 0 deletions

@@ -1,13 +1,18 @@
+from typing import Tuple
+
 import os
+import logging
 
 from collections import OrderedDict
 import itertools
 
+import xarray as xr
 import pandas as pd
 import yaml
 
 from batchglm.api.models.nb_glm import Simulator, Estimator
 
+logger = logging.getLogger(__name__)
 
 def init_benchmark(
         root_dir: str,
@@ -166,3 +171,51 @@ def run_benchmark(root_dir: str, sample: str, config_file="config.yml"):
     os.remove(os.path.join(working_dir, "lock"))
     touch(os.path.join(working_dir, "ready"))
     print("\t[OK]")
+
+
+def load_benchmark_dataset(root_dir: str, config_file="config.yml") -> Tuple[Simulator, xr.Dataset]:
+    config_file = os.path.join(root_dir, config_file)
+    with open(config_file, mode="r") as f:
+        config = yaml.load(f)
+
+    sim_data_file = os.path.join(root_dir, config["sim_data"])
+    sim = Simulator()
+    sim.load(sim_data_file)
+
+    benchmark_samples = config["benchmark_samples"]
+    benchmark_data = []
+    for smpl, cfg in benchmark_samples.items():
+        wd = cfg["working_dir"]
+        logger.info("opening working dir: %s", wd)
+        ds_path = os.path.join(root_dir, wd, "cache.zarr")
+        try:  # try to open the zarr cache
+            data = xr.open_zarr(ds_path)
+            logger.info("using zarr cache: %s", os.path.join(wd, "cache.zarr"))
+        except:  # fall back to the netcdf4 files
+            logger.info("loading step-wise netcdf4 files...")
+            ncdf_data = xr.open_mfdataset(
+                os.path.join(root_dir, cfg["working_dir"], "estimation-*.h5"),
+                engine="netcdf4",
+                concat_dim="step",
+                autoclose=True,
+                parallel=True,
+            )
+            ncdf_data = ncdf_data.sortby("global_step")
+            ncdf_data.coords["benchmark"] = smpl
+            logger.info("loading step-wise netcdf4 files ready")
+
+            try:  # try to save the data in the zarr cache
+                zarr_data = ncdf_data.to_zarr(ds_path)
+                logger.info("stored data in zarr cache")
+
+                # close the netcdf4 data sets
+                ncdf_data.close()
+                del ncdf_data
+                data = zarr_data
+            except:  # use netcdf4 since zarr does not seem to work
+                data = ncdf_data
+
+        benchmark_data.append(data)
+    benchmark_data = xr.auto_combine(benchmark_data, concat_dim="benchmark", coords="all")
+
+    return sim, benchmark_data
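
The heart of the new load_benchmark_dataset is a cache-or-rebuild pattern: open cache.zarr if possible, otherwise assemble the dataset from the per-step estimation-*.h5 files and try to write the cache for the next run. A minimal sketch of that pattern in isolation (open_cached and the paths are hypothetical; the open_mfdataset keywords follow the diff and may need adapting for newer xarray releases):

    import os
    import xarray as xr

    def open_cached(work_dir: str) -> xr.Dataset:
        """Open cache.zarr under work_dir, rebuilding it from estimation-*.h5 on a miss."""
        cache = os.path.join(work_dir, "cache.zarr")
        try:
            return xr.open_zarr(cache)  # fast path: the cache already exists
        except Exception:  # cache missing or unreadable: rebuild from netcdf4
            ds = xr.open_mfdataset(
                os.path.join(work_dir, "estimation-*.h5"),
                engine="netcdf4",
                concat_dim="step",
            ).sortby("global_step")
            try:
                ds.to_zarr(cache)  # persist the cache for the next run
                ds.close()
                return xr.open_zarr(cache)  # serve from the fresh cache
            except Exception:
                return ds  # zarr not usable here: stay on the netcdf4 files

Unlike the diff, the sketch re-opens the store after writing it, since Dataset.to_zarr returns a store handle rather than a Dataset.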

batchglm/benchmark/nb_glm/convergence.py
Lines changed: 15 additions & 38 deletions

@@ -2,57 +2,34 @@
 
 import os
 import shutil
+import logging
 
 import scipy.stats
 import numpy as np
 import xarray as xr
 import yaml
 
-from .base import init_benchmark, get_benchmark_samples, run_benchmark, Simulator
+from .base import init_benchmark, get_benchmark_samples, run_benchmark, load_benchmark_dataset
+from .base import Simulator
 
 import batchglm.utils.stats as stat_utils
 
-
-def load_benchmark_dataset(root_dir: str, config_file="config.yml") -> Tuple[Simulator, xr.Dataset, dict]:
-    config_file = os.path.join(root_dir, config_file)
-    with open(config_file, mode="r") as f:
-        config = yaml.load(f)
-
-    sim_data_file = os.path.join(root_dir, config["sim_data"])
-    sim = Simulator()
-    sim.load(sim_data_file)
-
-    benchmark_samples = config["benchmark_samples"]
-    benchmark_data = []
-    for smpl, cfg in benchmark_samples.items():
-        data = xr.open_mfdataset(
-            os.path.join(root_dir, cfg["working_dir"], "estimation-*.h5"),
-            engine="netcdf4",
-            concat_dim="step",
-            autoclose=True,
-            parallel=True,
-        )
-        data = data.sortby("global_step")
-        data.coords["benchmark"] = smpl
-        benchmark_data.append(data)
-    benchmark_data = xr.auto_combine(benchmark_data, concat_dim="benchmark", coords="all")
-
-    return sim, benchmark_data, benchmark_samples
+logger = logging.getLogger(__name__)
 
 
 def plot_benchmark(root_dir: str, config_file="config.yml"):
-    print("loading config...", end="", flush=True)
+    logger.info("loading config...", end="", flush=True)
     config_file = os.path.join(root_dir, config_file)
     with open(config_file, mode="r") as f:
         config = yaml.load(f)
-    print("\t[OK]")
+    logger.info("\t[OK]")
 
     plot_dir = os.path.join(root_dir, config["plot_dir"])
 
-    print("loading data...", end="", flush=True)
-    sim, benchmark_data, benchmark_sample_config = load_benchmark_dataset(root_dir)
+    logger.info("loading data...", end="", flush=True)
+    sim, benchmark_data = load_benchmark_dataset(root_dir)
     benchmark_data.coords["time_elapsed"] = benchmark_data.time_elapsed.cumsum("step")
-    print("\t[OK]")
+    logger.info("\t[OK]")
 
     import plotnine as pn
     import matplotlib.pyplot as plt
@@ -83,7 +60,7 @@ def plot_stat(val, val_name, name_prefix, scale_y_log10=False):
             plot = plot + pn.scale_y_log10()
         plot.save(os.path.join(plot_dir, name_prefix + ".step.svg"), format="svg")
 
-    print("plotting...")
+    logger.info("plotting...")
     val: xr.DataArray = stat_utils.rmsd(
         np.exp(xr.DataArray(sim.params["a"][0], dims=("features",))),
         np.exp(benchmark_data.a.isel(design_loc_params=0)), axis=[0])
@@ -98,7 +75,7 @@ def plot_stat(val, val_name, name_prefix, scale_y_log10=False):
     plot_stat(val, "loss", "loss")
 
     def plot_pval(window_size):
-        print("plotting p-value with window size: %d" % window_size)
+        logger.info("plotting p-value with window size: %d" % window_size)
 
         roll1 = benchmark_data.loss.rolling(step=window_size)
         roll2 = benchmark_data.loss.roll(step=window_size).rolling(step=window_size)
@@ -133,7 +110,7 @@ def plot_pval(window_size):
         plt.close()
 
     def plot_loss_rolling_mean(window_size):
-        print("plotting rolling mean of batch loss with window size: %d" % window_size)
+        logger.info("plotting rolling mean of batch loss with window size: %d" % window_size)
 
         benchmark_data.loss.rolling(step=window_size).mean().plot.line(hue="benchmark")
         plt.savefig(os.path.join(plot_dir, "batch_loss_rolling_mean.%dsteps.svg" % window_size), format="svg")
@@ -149,7 +126,7 @@ def plot_loss_rolling_mean(window_size):
     plt.savefig(os.path.join(plot_dir, "mean_full_gradient.svg"), format="svg")
     plt.close()
 
-    print("ready")
+    logger.info("ready")
 
 
 def clean(root_dir: str):
@@ -161,7 +138,7 @@ def clean(root_dir: str):
             elif os.path.isdir(file_path):
                 shutil.rmtree(file_path)
         except Exception as e:
-            print(e)
+            logger.info(e)
 
 
 def main():
@@ -235,7 +212,7 @@ def main():
     elif action == "print_samples":
         benchmark_samples = get_benchmark_samples(root_dir)
        for smpl in benchmark_samples:
-            print(smpl)
+            logger.info(smpl)
     elif action == "plot":
         plot_benchmark(root_dir)
     elif action == "clean":
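
One caveat about the print-to-logger conversion above: Logger.info accepts only logging-specific keywords (exc_info, stack_info, extra), not print's end and flush, so the converted calls that kept those keywords, such as logger.info("loading config...", end="", flush=True), raise TypeError when they run. A sketch of a keyword-free conversion of the same progress message:

    import logging

    logger = logging.getLogger(__name__)

    # Each log record is emitted as a complete line, so the old
    # print(..., end="") / print("\t[OK]") pair becomes two records:
    logger.info("loading config...")
    # ... load the config here ...
    logger.info("loading config... [OK]")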

batchglm/benchmark/nb_glm/performance.py
Lines changed: 17 additions & 47 deletions

@@ -2,16 +2,20 @@
 
 import os
 import shutil
+import logging
 
 import numpy as np
 import xarray as xr
 import yaml
 
-# import batchglm.pkg_constants
+from .base import load_benchmark_dataset, get_benchmark_samples
+
 from batchglm.api.models.nb_glm import Simulator, Estimator
 
 import batchglm.utils.stats as stat_utils
 
+logger = logging.getLogger(__name__)
+
 
 def init_benchmark(
         root_dir: str,
@@ -99,13 +103,6 @@ def prepare_benchmark_sample(
     return sample_config
 
 
-def get_benchmark_samples(root_dir: str, config_file="config.yml"):
-    config_file = os.path.join(root_dir, config_file)
-    with open(config_file, mode="r") as f:
-        config = yaml.load(f)
-    return list(config["benchmark_samples"].keys())
-
-
 def run_benchmark(root_dir: str, sample: str, config_file="config.yml"):
     config_file = os.path.join(root_dir, config_file)
     with open(config_file, mode="r") as f:
@@ -122,58 +119,31 @@ def run_benchmark(root_dir: str, sample: str, config_file="config.yml"):
     init_args = sample_config["init_args"]
     init_args["working_dir"] = working_dir
 
-    print("loading data...", end="", flush=True)
+    logger.info("loading data...", end="", flush=True)
     sim = Simulator()
     sim.load(sim_data_file)
-    print("\t[OK]")
+    logger.info("\t[OK]")
 
-    print("starting estimation of benchmark sample '%s'..." % sample)
+    logger.info("starting estimation of benchmark sample '%s'..." % sample)
     estimator = Estimator(sim.input_data, batch_size=batch_size)
     estimator.initialize(**init_args)
     estimator.train(learning_rate=learning_rate)
-    print("estimation of benchmark sample '%s' ready" % sample)
-
-
-def load_benchmark_dataset(root_dir: str, config_file="config.yml") -> Tuple[Simulator, xr.Dataset]:
-    config_file = os.path.join(root_dir, config_file)
-    with open(config_file, mode="r") as f:
-        config = yaml.load(f)
-
-    sim_data_file = os.path.join(root_dir, config["sim_data"])
-    sim = Simulator()
-    sim.load(sim_data_file)
-
-    benchmark_samples = config["benchmark_samples"]
-    benchmark_data = []
-    for smpl, cfg in benchmark_samples.items():
-        data = xr.open_mfdataset(
-            os.path.join(root_dir, cfg["working_dir"], "estimation-*.h5"),
-            engine="netcdf4",
-            concat_dim="step",
-            autoclose=True,
-            parallel=True,
-        )
-        data = data.sortby("global_step")
-        data.coords["benchmark"] = smpl
-        benchmark_data.append(data)
-    benchmark_data = xr.auto_combine(benchmark_data, concat_dim="benchmark", coords="all")
-
-    return sim, benchmark_data
+    logger.info("estimation of benchmark sample '%s' ready" % sample)
 
 
 def plot_benchmark(root_dir: str, config_file="config.yml"):
-    print("loading config...", end="", flush=True)
+    logger.info("loading config...", end="", flush=True)
     config_file = os.path.join(root_dir, config_file)
     with open(config_file, mode="r") as f:
         config = yaml.load(f)
-    print("\t[OK]")
+    logger.info("\t[OK]")
 
     plot_dir = os.path.join(root_dir, config["plot_dir"])
 
-    print("loading data...", end="", flush=True)
+    logger.info("loading data...", end="", flush=True)
     sim, benchmark_data = load_benchmark_dataset(root_dir)
     benchmark_data.coords["time_elapsed"] = benchmark_data.time_elapsed.cumsum("step")
-    print("\t[OK]")
+    logger.info("\t[OK]")
 
     import plotnine as pn
     import matplotlib.pyplot as plt
@@ -204,7 +174,7 @@ def plot_stat(val, val_name, name_prefix, scale_y_log10=False):
            plot = plot + pn.scale_y_log10()
        plot.save(os.path.join(plot_dir, name_prefix + ".step.svg"), format="svg")
 
-    print("plotting...")
+    logger.info("plotting...")
     val: xr.DataArray = stat_utils.rmsd(
         np.exp(xr.DataArray(sim.params["a"][0], dims=("features",))),
         np.exp(benchmark_data.a.isel(design_loc_params=0)), axis=[0])
@@ -218,7 +188,7 @@ def plot_stat(val, val_name, name_prefix, scale_y_log10=False):
     val: xr.DataArray = benchmark_data.loss
     plot_stat(val, "loss", "loss")
 
-    print("ready")
+    logger.info("ready")
 
 
 def clean(root_dir: str):
@@ -230,7 +200,7 @@ def clean(root_dir: str):
             elif os.path.isdir(file_path):
                 shutil.rmtree(file_path)
         except Exception as e:
-            print(e)
+            logger.info(e)
 
 
 def main():
@@ -293,7 +263,7 @@ def main():
     elif action == "print_samples":
         benchmark_samples = get_benchmark_samples(root_dir)
         for smpl in benchmark_samples:
-            print(smpl)
+            logger.info(smpl)
     elif action == "plot":
         plot_benchmark(root_dir)
     elif action == "clean":
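
With the duplicated helpers gone, both benchmark scripts resolve load_benchmark_dataset and get_benchmark_samples to the single copies in base.py. A hypothetical session against the shared entry points (the root_dir value is made up):

    from batchglm.benchmark.nb_glm.base import (
        get_benchmark_samples,
        load_benchmark_dataset,
    )

    root_dir = "benchmarks/nb_glm"  # hypothetical benchmark directory

    for sample in get_benchmark_samples(root_dir):
        print(sample)  # list the configured benchmark samples

    # The consolidated loader returns a pair, without the former config dict:
    sim, benchmark_data = load_benchmark_dataset(root_dir)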

batchglm/log_cfg.py
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+import sys
+
+import logging
+
+logger = logging.getLogger('.'.join(__name__.split('.')[:-1]))
+
+_is_interactive = bool(getattr(sys, 'ps1', sys.flags.interactive))
+_hander = None
+
+
+def unconfigure_logging():
+    if _hander is not None:
+        logger.removeHandler(_hander)
+
+    logger.setLevel(logging.NOTSET)
+
+
+def enable_logging(verbosity=logging.ERROR, stream=sys.stderr, format=logging.BASIC_FORMAT):
+    unconfigure_logging()
+
+    logger.setLevel(verbosity)
+    _handler = logging.StreamHandler(stream)
+    _handler.setFormatter(logging.Formatter(format, None))
+    logger.addHandler(_handler)
+
+
+# If we are in an interactive environment (like Jupyter), set loglevel to INFO and pipe the output to stdout.
+if _is_interactive:
+    enable_logging(logging.INFO, sys.stdout)
+else:
+    enable_logging(logging.WARNING, sys.stderr)
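
log_cfg configures itself on import: INFO to stdout in interactive sessions (e.g. Jupyter), WARNING to stderr otherwise, and the two helpers let callers override that. A minimal usage sketch, assuming only what the diff shows:

    import logging
    import batchglm

    # Replace the import-time default: show DEBUG and above on stderr.
    batchglm.enable_logging(verbosity=logging.DEBUG)
    batchglm.logger.debug("now visible")

    # Drop the package logger level back to NOTSET.
    batchglm.unconfigure_logging()

Note that, as committed, enable_logging stores its handler in a local _handler while unconfigure_logging checks the module-level _hander, so repeated enable_logging calls stack handlers rather than replacing them.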

batchglm/train/tf/nb_glm/base.py
Lines changed: 0 additions & 5 deletions

@@ -7,11 +7,6 @@
 
 import numpy as np
 
-try:
-    import anndata
-except ImportError:
-    anndata = None
-
 from .external import AbstractEstimator
 from .external import nb_utils
 from .external import pkg_constants
