Skip to content

Commit ed703da

Browse files
authored
Merge pull request #242 from Jhsmit/cache
Cache
2 parents 2d7b0ad + b3a502d commit ed703da

File tree

7 files changed

+267
-82
lines changed

7 files changed

+267
-82
lines changed

pyhdx/support.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,26 @@
1313
from dask.distributed import Client
1414

1515

16+
def make_tuple(item):
    """Recursively convert *item* into a hashable, tuple-based structure.

    Lists and tuples become tuples of converted elements; dicts become
    tuples of ``(key, converted_value)`` pairs (insertion order preserved).
    Any other object is returned unchanged. Useful for building cache keys
    from nested list/dict specs.

    :param item: arbitrarily nested list/tuple/dict structure
    :return: an equivalent structure built only from tuples (hashable,
        provided the leaf values are hashable)
    """
    if isinstance(item, (list, tuple)):
        # Also recurse into tuples so e.g. ([1, 2],) becomes ((1, 2),);
        # for tuples of hashables the result compares equal to the input.
        return tuple(make_tuple(i) for i in item)
    elif isinstance(item, dict):
        return tuple((key, make_tuple(value)) for key, value in item.items())
    else:
        return item
23+
24+
25+
def hash_dataframe(df):
    """Return a stable hash of a DataFrame's contents and labels.

    The hash covers the per-row value hashes (including the index, via
    ``pd.util.hash_pandas_object``), the column labels, the column level
    names and the index name, so two frames hash equal only when values
    and labels all match.

    :param df: :class:`pandas.DataFrame` to hash
    :return: integer hash
    :raises TypeError: if the frame contains unhashable column labels or
        dtypes that ``hash_pandas_object`` cannot process
    """
    # NOTE(review): the original wrapped this in a try/except TypeError that
    # only printed debug output and then hit a NameError on the unbound
    # `tup`; letting the TypeError propagate is strictly more informative.
    tup = (
        *pd.util.hash_pandas_object(df, index=True).values,
        *df.columns,
        *df.columns.names,
        df.index.name,
    )
    return hash(tup)
35+
1636
def multiindex_apply_function(
1737
index: pd.MultiIndex,
1838
level: int,

pyhdx/web/apps.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
from pathlib import Path
32

43
import panel as pn
@@ -7,14 +6,18 @@
76
from pyhdx import VERSION_STRING
87
from pyhdx.web.constructor import AppConstructor
98
from pyhdx.web.log import logger
9+
from pyhdx.web.cache import MemoryCache, HybridHDFCache
10+
11+
cache = MemoryCache(max_items=2000)
1012

13+
#cache = HybridHDFCache(file_path ='test123.h5')
1114

1215
@logger('pyhdx')
1316
def main_app():
1417
cwd = Path(__file__).parent.resolve()
1518
yaml_dict = yaml.safe_load((cwd / 'pyhdx_app.yaml').read_text(encoding='utf-8'))
1619

17-
ctr = AppConstructor(loggers={'pyhdx': main_app.logger})
20+
ctr = AppConstructor(loggers={'pyhdx': main_app.logger}, cache=cache)
1821

1922
ctrl = ctr.parse(yaml_dict)
2023

pyhdx/web/cache.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import param
2+
import pandas as pd
3+
4+
5+
class Cache(param.Parameterized):
    """Null-object base cache: never stores anything, never reports a hit."""

    def __getitem__(self, item):
        # No entries are ever held, so every lookup yields None.
        return None

    def __setitem__(self, key, value):
        # Writes are silently discarded.
        pass

    def __contains__(self, item):
        # Nothing is ever cached, so membership is always False.
        return False
16+
17+
class MemoryCache(Cache):
    """In-memory cache backed by a dict, with an optional size bound."""

    # Internal key -> value store.
    _cache = param.Dict(default={})

    max_items = param.Integer(
        None,
        doc='Maximum number of items allowed in the cache'
    )

    def __getitem__(self, item):
        """Return the cached value for *item*; raises KeyError on a miss."""
        return self._cache[item]

    def __setitem__(self, key, value):
        """Store *value* under *key*, evicting the oldest entry when full."""
        # Only evict when inserting a genuinely new key: overwriting an
        # existing entry does not grow the cache.
        if (
            self.max_items is not None
            and key not in self._cache
            and len(self._cache) >= self.max_items
        ):
            # Evict the oldest entry (dicts preserve insertion order).
            # dict.popitem() would remove the *newest* entry instead,
            # pinning stale items in the cache forever.
            oldest = next(iter(self._cache))
            del self._cache[oldest]

        self._cache[key] = value

    def __contains__(self, item):
        return item in self._cache
38+
39+
class HybridHDFCache(Cache):
    """
    Hybrid HDFStore / Memory cache.

    Large DataFrames / Series (memory footprint above ``bytes_threshold``)
    are written to an on-disk HDF5 store; everything else is kept in an
    in-memory dict. Keys are stringified before use so they are valid
    HDFStore paths.

    Sometimes there are errors depending on the dtypes of dataframes stored.
    """
    file_path = param.String(doc='Path of the backing HDF5 file')

    _store = param.ClassSelector(class_=pd.HDFStore)

    _cache = param.Dict(default={})

    bytes_threshold = param.Integer(default=int(1e8))

    def __init__(self, **params):
        super().__init__(**params)
        # param.String defaults to '' (not None), so test truthiness;
        # `is not None` would always be true and open pd.HDFStore('').
        if self.file_path:
            self._store = pd.HDFStore(self.file_path)

    def __getitem__(self, item):
        key = str(item)
        try:
            return self._cache.__getitem__(key)
        except KeyError:
            # Fall back to the on-disk store (raises KeyError on a full miss).
            return self._store.__getitem__(key)

    def _store_put(self, key, value):
        """Put *value* in the HDF store, falling back to memory on failure."""
        try:
            self._store[key] = value

            # Check if reading back the dataframe works
            try:
                _value = self._store[key]
            except AttributeError:
                del self._store[key]
                self._cache[key] = value

        except (NotImplementedError, TypeError):  # pytables does not support categorical dtypes
            self._cache[key] = value

    def __setitem__(self, key, value):
        key = str(key)
        if isinstance(value, pd.DataFrame) and value.memory_usage().sum() > self.bytes_threshold:
            self._store_put(key, value)
        elif isinstance(value, pd.Series) and value.memory_usage() > self.bytes_threshold:
            self._store_put(key, value)
        else:
            self._cache[key] = value

    def __contains__(self, item):
        key = str(item)
        if key in self._cache:
            return True
        # HDFStore.keys() returns '/'-prefixed paths ('/foo'), so a plain
        # set-union membership test never matches store entries. HDFStore's
        # own __contains__ matches keys with or without the leading slash.
        return self._store is not None and key in self._store

    # todo with statement for creating caches?
    # def __exit__(self):
    #     pass

pyhdx/web/constructor.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
from pyhdx.web.tools import supported_tools
1212
from pyhdx.web.transforms import *
1313
from pyhdx.web.views import View
14+
from pyhdx.web.cache import Cache
1415

1516
element_count = 0
1617

18+
1719
class AppConstructor(param.Parameterized):
1820

1921
sources = param.Dict(default={})
@@ -32,6 +34,8 @@ class AppConstructor(param.Parameterized):
3234

3335
client = param.ClassSelector(default=None, class_=Client)
3436

37+
cache = param.ClassSelector(default=Cache(), class_=Cache)
38+
3539
def __init__(self, **params):
3640
super().__init__(**params)
3741
self.classes = self.find_classes()
@@ -105,12 +109,21 @@ def _parse_sections(self, yaml_dict):
105109
obj = self.create_element(name, element, **spec)
106110
element_dict[name] = obj
107111

108-
def create_element(self, name, element, **spec):
112+
def create_element(self, name: str, element: str, **spec):
113+
"""
114+
115+
:param name:
116+
:param element: either source, filter, opt, view, tool
117+
:param spec:
118+
:return:
119+
"""
109120
global element_count
110121

111122
_type = spec.pop('type')
112123
kwargs = self._resolve_kwargs(**spec)
113124
class_ = self._resolve_class(_type, element)
125+
if element == 'transform':
126+
kwargs['_cache'] = self.cache
114127
obj = class_(name=name, **kwargs)
115128
element_count += 1
116129

pyhdx/web/controllers.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,17 @@ def _action_debug(self):
7676
print('break')
7777

7878
def _action_test(self):
79-
trs = self.transforms['table_1_select']
80-
print(trs.widgets)
79+
trs = self.transforms['peptide_select']
80+
cache = trs._cache
81+
print(cache._cache.keys())
82+
print(cache)
83+
print(cache._store.keys())
84+
85+
for item in cache._store.keys():
86+
print(item)
87+
print(cache[item])
88+
8189

82-
view = self.views['graph_1']
83-
df = view.get_data()
84-
print(df)
8590

8691
@property
8792
def _layout(self):
@@ -772,14 +777,17 @@ def _action_add_comparison(self):
772777

773778
combined = pd.concat([ddG, cov], axis=1)
774779

780+
#todo use _add_table method on source
775781
if current_df is not None:
776782
new_df = pd.concat([current_df, combined], axis=1)
777783
else:
778784
new_df = combined
779785

780-
self.parent.sources['main'].tables['ddG_comparison'] = new_df
781-
self.parent.sources['main'].param.trigger('tables') #todo check/remove tables trigger
782-
self.parent.sources['main'].updated = True
786+
#self.parent.sources['main'].tables['ddG_comparison'] = new_df
787+
self.src.add_table('ddG_comparison', new_df)
788+
789+
#self.parent.sources['main'].param.trigger('tables') #todo check/remove tables trigger
790+
self.src.updated = True
783791

784792

785793
class ColorTransformControl(PyHDXControlPanel):

pyhdx/web/sources.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pyhdx import TorchFitResult
88
from pyhdx.fitting import RatesFitResult
99
from pyhdx.models import HDXMeasurement, HDXMeasurementSet
10-
from pyhdx.support import multiindex_astype, multiindex_set_categories
10+
from pyhdx.support import multiindex_astype, multiindex_set_categories, hash_dataframe
1111

1212

1313
class Source(param.Parameterized):
@@ -23,7 +23,13 @@ def get(self):
2323

2424
class TableSource(Source):
2525

26-
tables = param.Dict({})
26+
tables = param.Dict(
27+
default={},
28+
doc="Dictionary of tables (pd.DataFrames)")
29+
30+
hashes = param.Dict(
31+
default={},
32+
doc="Dictionary of table hashes")
2733

2834
_type = 'table'
2935

@@ -33,6 +39,13 @@ def get(self):
3339
else:
3440
raise ValueError("TableSource has multiple tables, use `get_table`")
3541

42+
def add_table(self, table: str, df: pd.DataFrame):
    """Register *df* under the name *table* and record its content hash.

    :param table: name to store the table under
    :param df: the DataFrame to register
    """
    self.hashes[table] = hash_dataframe(df)
    self.tables[table] = df

    # todo self.updated = True?
48+
3649
def get_table(self, table):
3750
df = self.tables.get(table, None)
3851

@@ -193,7 +206,8 @@ def _add_table(self, df, table, categorical=True):
193206
if categorical:
194207
new.columns = multiindex_astype(new.columns, 0, 'category')
195208
new.columns = multiindex_set_categories(new.columns, 0, categories, ordered=True)
196-
self.tables[table] = new
209+
210+
self.add_table(table, new)
197211

198212

199213
class PDBSource(Source):
@@ -202,6 +216,8 @@ class PDBSource(Source):
202216

203217
pdb_files = param.Dict({}, doc='Dictionary with id: pdb_string pdb file entries')
204218

219+
hashes = param.Dict({})
220+
205221
max_entries = param.Number(
206222
1,
207223
doc='set maximum size for pdb files. set to none for infinite size. set to one for single pdb mode')
@@ -213,11 +229,13 @@ def add_from_pdb(self, pdb_id):
213229
pdb_string = response.read().decode()
214230

215231
self.pdb_files[pdb_id] = pdb_string
232+
self.hashes[pdb_id] = hash(pdb_string)
216233
self.updated = True
217234

218235
def add_from_string(self, pdb_string, pdb_id):
219236
self._make_room()
220237
self.pdb_files[pdb_id] = pdb_string
238+
self.hashes[pdb_id] = hash(pdb_string)
221239
self.updated = True
222240

223241
def _make_room(self):
@@ -227,9 +245,10 @@ def _make_room(self):
227245
elif len(self.pdb_files) == self.max_entries:
228246
key = next(iter(self.pdb_files))
229247
del self.pdb_files[key]
248+
del self.hashes[key]
230249

231250
def get(self):
232-
"""returns the first entry in the """
251+
"""returns the first entry in the pdb source"""
233252
return next(iter(self.pdb_files.values()))
234253

235254
def get_pdb(self, pdb_id):

0 commit comments

Comments
 (0)