
Commit 9a9bc89

add hashing to check if update is needed
1 parent bcfa8f9 commit 9a9bc89

4 files changed: 112 additions and 57 deletions

pyhdx/support.py

Lines changed: 20 additions & 0 deletions
@@ -13,6 +13,26 @@
 from dask.distributed import Client
 
 
+def make_tuple(item):
+    if isinstance(item, list):
+        return tuple(make_tuple(i) for i in item)
+    elif isinstance(item, dict):
+        return tuple((key, make_tuple(value)) for key, value in item.items())
+    else:
+        return item
+
+
+def hash_dataframe(df):
+    try:
+        tup = (*pd.util.hash_pandas_object(df, index=True).values, *df.columns, *df.columns.names, df.index.name)
+
+    except TypeError:
+        print(df)
+        print('hoi')
+
+    return hash(tup)
+
+
 def multiindex_apply_function(
     index: pd.MultiIndex,
     level: int,
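
A brief usage sketch (not part of the commit; it only assumes the hash_dataframe helper added above is importable from pyhdx.support): identical frames hash to the same value, so a caller can compare hashes to decide whether downstream updates are actually needed.

    import pandas as pd
    from pyhdx.support import hash_dataframe  # helper added in this commit

    df = pd.DataFrame({'deltaG': [10.2, 11.5, 9.8]}, index=['r1', 'r2', 'r3'])

    h1 = hash_dataframe(df)
    h2 = hash_dataframe(df.copy())   # equal content -> equal hash
    assert h1 == h2

    df.loc['r1', 'deltaG'] = 12.0    # changing any value changes the hash
    assert hash_dataframe(df) != h1

Note that the except TypeError branch above only prints debug output and still falls through to return hash(tup), so an unhashable frame would end in an UnboundLocalError there.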

pyhdx/web/controllers.py

Lines changed: 6 additions & 3 deletions
@@ -772,14 +772,17 @@ def _action_add_comparison(self):
 
         combined = pd.concat([ddG, cov], axis=1)
 
+        #todo use _add_table method on source
         if current_df is not None:
             new_df = pd.concat([current_df, combined], axis=1)
         else:
             new_df = combined
 
-        self.parent.sources['main'].tables['ddG_comparison'] = new_df
-        self.parent.sources['main'].param.trigger('tables') #todo check/remove tables trigger
-        self.parent.sources['main'].updated = True
+        #self.parent.sources['main'].tables['ddG_comparison'] = new_df
+        self.src.add_table('ddG_comparison', new_df)
+
+        #self.parent.sources['main'].param.trigger('tables') #todo check/remove tables trigger
+        self.src.updated = True
 
 
 class ColorTransformControl(PyHDXControlPanel):
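
The controller now routes the combined frame through add_table on the source instead of writing into the tables dict and triggering updates by hand. A minimal stand-in sketch of what that call records (MiniTableSource below is hypothetical, for illustration only; only hash_dataframe comes from this commit):

    import pandas as pd
    from pyhdx.support import hash_dataframe

    class MiniTableSource:
        """Hypothetical stand-in mirroring Source.add_table from sources.py below."""
        def __init__(self):
            self.tables = {}
            self.hashes = {}

        def add_table(self, table, df):
            self.hashes[table] = hash_dataframe(df)  # content hash stored next to the table
            self.tables[table] = df

    src = MiniTableSource()
    src.add_table('ddG_comparison', pd.DataFrame({'ddG': [0.1, 0.2], 'covariance': [0.01, 0.02]}))

    seen = src.hashes['ddG_comparison']              # a consumer remembers the last hash
    src.add_table('ddG_comparison', pd.DataFrame({'ddG': [0.1, 0.2], 'covariance': [0.01, 0.02]}))
    needs_update = src.hashes['ddG_comparison'] != seen   # False: content did not change

Because the hash is stored alongside the table, downstream consumers can skip recomputation when a table is re-added with unchanged content.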

pyhdx/web/sources.py

Lines changed: 16 additions & 3 deletions
@@ -7,7 +7,7 @@
 from pyhdx import TorchFitResult
 from pyhdx.fitting import RatesFitResult
 from pyhdx.models import HDXMeasurement, HDXMeasurementSet
-from pyhdx.support import multiindex_astype, multiindex_set_categories
+from pyhdx.support import multiindex_astype, multiindex_set_categories, hash_dataframe
 
 
 class Source(param.Parameterized):
@@ -39,6 +39,13 @@ def get(self):
         else:
             raise ValueError("TableSource has multiple tables, use `get_table`")
 
+    def add_table(self, table: str, df: pd.DataFrame):
+        table_hash = hash_dataframe(df)
+        self.hashes[table] = table_hash
+        self.tables[table] = df
+
+        #todo self.updated = True?
+
     def get_table(self, table):
         df = self.tables.get(table, None)
 
@@ -199,7 +206,8 @@ def _add_table(self, df, table, categorical=True):
         if categorical:
             new.columns = multiindex_astype(new.columns, 0, 'category')
             new.columns = multiindex_set_categories(new.columns, 0, categories, ordered=True)
-        self.tables[table] = new
+
+        self.add_table(table, new)
 
 
 class PDBSource(Source):
@@ -208,6 +216,8 @@ class PDBSource(Source):
 
     pdb_files = param.Dict({}, doc='Dictionary with id: pdb_string pdb file entries')
 
+    hashes = param.Dict({})
+
     max_entries = param.Number(
         1,
         doc='set maximum size for pdb files. set to none for infinite size. set to one for single pdb mode')
@@ -219,11 +229,13 @@ def add_from_pdb(self, pdb_id):
         pdb_string = response.read().decode()
 
         self.pdb_files[pdb_id] = pdb_string
+        self.hashes[pdb_id] = hash(pdb_string)
         self.updated = True
 
     def add_from_string(self, pdb_string, pdb_id):
         self._make_room()
         self.pdb_files[pdb_id] = pdb_string
+        self.hashes[pdb_id] = hash(pdb_string)
         self.updated = True
 
     def _make_room(self):
@@ -233,9 +245,10 @@ def _make_room(self):
         elif len(self.pdb_files) == self.max_entries:
             key = next(iter(self.pdb_files))
             del self.pdb_files[key]
+            del self.hashes[key]
 
     def get(self):
-        """returns the first entry in the """
+        """returns the first entry in the pdb source"""
         return next(iter(self.pdb_files.values()))
 
     def get_pdb(self, pdb_id):
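
PDBSource applies the same bookkeeping to raw PDB strings using the built-in hash. A rough sketch of that pattern with plain dictionaries (hypothetical names, loosely mirroring add_from_string and _make_room above):

    pdb_files, hashes = {}, {}

    def add_from_string(pdb_string, pdb_id):
        pdb_files[pdb_id] = pdb_string
        hashes[pdb_id] = hash(pdb_string)   # content hash kept alongside the entry

    def evict(pdb_id):
        del pdb_files[pdb_id]
        del hashes[pdb_id]                  # hashes are removed together with their entries

    add_from_string('HEADER    EXAMPLE PDB\nEND\n', '1abc')
    assert hashes['1abc'] == hash(pdb_files['1abc'])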

pyhdx/web/transforms.py

Lines changed: 70 additions & 51 deletions
@@ -1,27 +1,33 @@
 import itertools
+import warnings
 
 import numpy as np
 import pandas as pd
 import panel as pn
 import param
 from param.parameterized import default_label_formatter
 
-from pyhdx.support import autowrap
+from pyhdx.support import autowrap, make_tuple
 from pyhdx.web.sources import Source
+from pyhdx.web.cache import Cache
 
 
 class Transform(param.Parameterized):
-    """these transforms get the data from source"""
+    """Gets data and applies transform"""
 
     _type = 'base'
 
     widgets = param.Dict(default={})
 
+    source = param.ClassSelector(class_=Source)
+
     updated = param.Event()
 
     redrawn = param.Event(doc="event gets triggered when widgets are changed and the controller needs to redraw them")
 
-    cache = param.ClassSelector(class_=Cache)
+    _hash = param.Integer(doc='Hash of current transform state')
+
+    _cache = param.ClassSelector(default=None, class_=Cache)
 
     def __init__(self, **params):
         super().__init__(**params)
@@ -30,14 +36,40 @@ def get(self):
         """method called to get the dataframe"""
         return None
 
+    # perhaps htey should all be private to prevent namespace collision with filter options
+    @property
+    def source_hash(self):
+        return self.source.hash
+
+    @property
+    def hash_key(self):
+        """hashable key describing the transform"""
+        return tuple((item, make_tuple(val)) for item, val in self.param.get_param_values() if not item.startswith('_'))
+
+    @property
+    def hash(self):
+        tup = (*self.hash_key, self.source_hash)
+
+        return hash(tup)
+
+    def update_hash(self):
+        if self.hash == self._hash:
+            return False
+        else:
+            self._hash = self.hash
+            return True
+
+    def update(self):
+        if self.update_hash():
+            self._update_options()
+            self.updated = True
+
 
 class TableSourceTransform(Transform):
     """transform which picks the correct table from the source"""
 
     _type = 'table_source'
 
-    source = param.ClassSelector(class_=Source)
-
     table = param.Selector(default=None, doc="""
         The table being transformed. """)
 
@@ -56,14 +88,8 @@ def get(self):
         return df
 
     @property
-    def hash(self):
-        # or sources can have multiple hashes?
-        # / objects can have multiple hashes?
-        return tuple([self._type, self.table, self.source.hashes[self.table]])
-
-    @param.depends('table', watch=True)
-    def _table_updated(self):
-        self.updated = True
+    def source_hash(self):
+        return self.source.hashes.get(self.table, hash(None))
 
     def _update_options(self):
         options = self.source.get_tables()
@@ -73,10 +99,11 @@ def _update_options(self):
         if not self.table and options:
             self.table = options[0]
 
-    @param.depends('source.updated', watch=True)
+    @param.depends('source.updated', 'table', watch=True)
     def update(self):
         self._update_options()
-        self.updated = True
+        if self.update_hash():
+            self.updated = True
 
 
 class AppTransform(Transform):
@@ -116,43 +143,37 @@ class CrossSectionTransform(AppTransform):
     empty_select = param.Boolean(default=False, doc="""
         Add an option to Select widgets to indicate select all on this level.""")
 
-    # stepwise = param.Boolean(
-    #     default=False,
-    #     doc='Apply xs stepwise (one call per level)'
-    # )
-
     def __init__(self, **params):
         super().__init__(**params)
         self.index = None # index is the df index which determines the selector's options
         self.update()
 
     @param.depends('source.updated', watch=True)
     def update(self):
-        #todo only redraw if only options are changed or always?
-        #todo remove watchers when new transforms are created?
-
-
-        old_index = self.index
-        df = self.source.get()
-
-        if df is None:
-            return
-        self.index = df.columns if self.axis else df.index
-        self._names = self.names or self.index.names
-
-        if old_index is not None and self.index.nlevels == old_index.nlevels:
-            # no redraw needed, only update selectors options
-            options = list(self.index.unique(level=0))
-            self.selectors[0].options = options
-            self.selectors[0].param.trigger('value')
-            for name, selector in zip(self._names, self.selectors):
-                selector.name = name # todo requires testing if the names are really updated or not (they arent)
-                selector.label = name # todo requires testing if the names are really updated or not
-            self.redrawn = True
-        else:
-            self.redraw()
+        if self.update_hash():
+            #todo remove watchers when new transforms are created?
+
+            old_index = self.index
+            df = self.source.get()
+
+            if df is None:
+                return
+            self.index = df.columns if self.axis else df.index
+            self._names = self.names or self.index.names
+
+            if old_index is not None and self.index.nlevels == old_index.nlevels:
+                # no redraw needed, only update selectors options
                options = list(self.index.unique(level=0))
+                self.selectors[0].options = options
+                self.selectors[0].param.trigger('value')
+                for name, selector in zip(self._names, self.selectors):
+                    selector.name = name # todo requires testing if the names are really updated or not (they arent)
+                    selector.label = name # todo requires testing if the names are really updated or not
+                self.redrawn = True
+            else:
+                self.redraw()
 
-        self.updated = True
+            self.updated = True
 
     def redraw(self):
         # create new widgets
@@ -163,6 +184,7 @@ def redraw(self):
 
         self.widgets = {name: pn.widgets.Select(name=default_label_formatter(name)) for name in self._names[:n_levels]}
 
+        #todo perhaps do self.param.add_parameter?
         self.selectors = list(self.widgets.values())
         for selector in self.selectors:
             selector.param.watch(self._selector_changed, ['value'], onlychanged=True)
@@ -237,6 +259,7 @@ class ApplyCmapOptTransform(AppTransform):
     #def check_args(... ) #todo method for constructor to see if the supplied kwargs are correct for this object
 
     def __init__(self, opts, **params): #opts: list of opts objects
+        warnings.warn('ApplyCmapOptTransform does not implement hashing', NotImplementedError)
         self._opts_dict = {o.name: o for o in opts}
         opts = list(self._opts_dict.keys())
         params['opts'] = opts
@@ -349,6 +372,8 @@ class GenericTransform(AppTransform):
 
     pd_function = param.String()
 
+    kwargs = param.Dict(doc='dict of additional kwargs')
+
     def __init__(self, **params):
         self.kwargs = {k: v for k, v in params.items() if k not in self.param}
         super().__init__(**{k: v for k, v in params.items() if k in self.param})
@@ -467,7 +492,7 @@ def __init__(self, **params):
 
 
 class SampleTransform(AppTransform):
-    """subsamples dataframe along """
+    """subsamples dataframe along specified axis"""
 
     _type = 'sample'
 
@@ -513,12 +538,6 @@ def get(self):
         return df
 
 
-class TransformTransform(AppTransform):
-    pd_function = param.String('transform')
-    def __init__(self, **params):
-        raise NotImplementedError()
-
-
 class PipeTransform(AppTransform):
     """applies a list of pandas functions