Commit e42fb3b

>> added support for dumping/saving scalars (HDF5 + pickle formats).
>> removed the special pickle/HDF groups __axes__ and __groups__ since they are no longer required.
1 parent d7627ac commit e42fb3b

File tree

5 files changed: +103 -84 lines

larray/core/session.py
larray/inout/common.py
larray/inout/hdf.py
larray/inout/pickle.py
larray/tests/test_session.py
larray/core/session.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -394,7 +394,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
         >>> # the path of file to the Session constructor
         >>> ses = Session('input.h5')
         >>> ses
-        Session(i, s, a, b, a01, arr1, arr2)
+        Session(a, a01, arr1, arr2, b, i, s)
         >>> ses.meta
         title: my title
         author: John Smith
@@ -404,11 +404,11 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
         >>> ses = Session()
         >>> ses.load('input.h5', names=['s', 'a', 'b', 'arr1', 'arr2'], display=True)
         opening input.h5
-        loading scalar object s ... done
         loading Axis object a ... done
-        loading Axis object b ... done
         loading Array object arr1 ... done
         loading Array object arr2 ... done
+        loading Axis object b ... done
+        loading str object s ... done
 
         Using .csv files (assuming the same session as above)
 
```
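These doctest updates reflect the new behavior: the contents of a loaded Session now appear in alphabetical order, and a scalar is reported with its concrete type (here str) instead of the generic 'scalar' label. As a minimal round-trip sketch (file and variable names are illustrative; saving to HDF5 requires pytables):

```python
from larray import Axis, Session, ndtest

# build a session mixing an array, an axis and a plain Python scalar
ses = Session()
ses['arr1'] = ndtest((2, 3))
ses['a'] = Axis('a=a0,a1')
ses['s'] = 'my string scalar'

# with this commit, the scalar is dumped and reloaded like any other item
ses.save('input.h5')
ses2 = Session('input.h5')
assert ses2['s'] == 'my string scalar'
```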

larray/inout/common.py

Lines changed: 7 additions & 4 deletions
```diff
@@ -5,14 +5,17 @@
 from collections import OrderedDict
 
 from larray.util.compat import basestring
+from larray.core.axis import Axis
+from larray.core.group import Group
 from larray.core.array import Array
 
 
+# all formats
+_supported_larray_types = (Axis, Group, Array)
+
 # only for HDF5 and pickle formats
-# XXX: also include scalars in Excel and CSV formats (via __scalars__ special sheet/.csv file)?
-# But then why saving scalars but not Axis and Group objects. That would be inconsistent.
-# XXX: support list and dict?
-_supported_scalar_types = (int, float, bool, basestring, date, time, datetime)
+# support list, tuple and dict?
+_supported_scalars_types = (int, float, bool, basestring, date, time, datetime)
 
 
 def _get_index_col(nb_axes=None, index_col=None, wide=True):
```
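These two tuples now drive the isinstance checks in the handlers: `_supported_larray_types` applies to all formats, `_supported_scalars_types` only to HDF5 and pickle. A small sketch of how both sides of a dump/load cycle use them (assuming Python 3, where basestring is aliased to str):

```python
from datetime import date, time, datetime

# same contents as _supported_scalars_types above (basestring -> str on Python 3)
supported_scalars = (int, float, bool, str, date, time, datetime)

value = 0.025
# dump side: gate on the classes themselves
assert isinstance(value, supported_scalars)

# load side: read_hdf() and _read_item() match on the stored type *name*,
# so a name set is derived from the same tuple
scalar_names = {cls.__name__ for cls in supported_scalars}
assert type(value).__name__ in scalar_names
```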

larray/inout/hdf.py

Lines changed: 81 additions & 70 deletions
```diff
@@ -13,11 +13,36 @@
 from larray.core.metadata import Metadata
 from larray.util.misc import LHDFStore
 from larray.inout.session import register_file_handler
-from larray.inout.common import FileHandler, _supported_scalar_types
+from larray.inout.common import FileHandler, _supported_larray_types, _supported_scalars_types
 from larray.inout.pandas import df_asarray
 from larray.example import get_example_filepath
 
 
+_hdf_supported_types = _supported_larray_types + _supported_scalars_types
+
+
+class ScalarHDF(object):
+    def __init__(self, value):
+        _type = type(value).__name__
+        if not isinstance(value, _supported_scalars_types):
+            raise TypeError("Type {} is not currently supported by the HDF5 format".format(_type))
+        self.value = value
+        self._type = _type
+
+    def to_hdf(self, filepath, key):
+        key = _translate_group_key_hdf(key)
+        s = pd.Series(data=self.value)
+        with LHDFStore(filepath) as store:
+            store.put(key, s)
+            store.get_storer(key).attrs.type = self._type
+
+
+# for backward compatibility (larray < 0.29), but any object read from an HDF file
+# should have a 'type' attribute
+def _get_type_from_attrs(attrs):
+    return attrs.type if 'type' in attrs else 'Array'
+
+
 def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, sort_columns=False,
              name=None, **kwargs):
     r"""Reads a scalar, axis, group or array named key from an HDF5 file in filepath (path+name)
```
```diff
@@ -73,53 +98,51 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s
     with LHDFStore(filepath_or_buffer) as store:
         try:
             pd_obj = store.get(key)
-            attrs = store.get_storer(key).attrs
-            writer = attrs.writer if 'writer' in attrs else None
-            # for backward compatibility but any object read from an hdf file should have an attribute 'type'
-            _type = attrs.type if 'type' in attrs else 'Array'
-            _meta = attrs.metadata if 'metadata' in attrs else None
-            if _type == 'Array':
-                # cartesian product is not necessary if the array was written by LArray
-                cartesian_prod = writer != 'LArray'
-                res = df_asarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value,
-                                 parse_header=False, cartesian_prod=cartesian_prod)
-                if _meta is not None:
-                    res.meta = _meta
-            elif _type == 'Axis':
-                if name is None:
-                    name = str(pd_obj.name)
-                if name == 'None':
-                    name = None
-                labels = pd_obj.values
-                if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
-                    # this check is there because there are cases where dtype_kind is 'U' but pandas returns
-                    # an array with object dtype containing bytes instead of a string array, and in that case
-                    # np.char.decode does not work
-                    # this is at least the case for Python2 + Pandas 0.24.2 combination
-                    if labels.dtype.kind == 'O':
-                        labels = np.array([l.decode('utf-8') for l in labels], dtype='U')
-                    else:
-                        labels = np.char.decode(labels, 'utf-8')
-                res = Axis(labels=labels, name=name)
-                res._iswildcard = attrs['wildcard']
-            elif _type == 'Group':
-                if name is None:
-                    name = str(pd_obj.name)
-                if name == 'None':
-                    name = None
-                key = pd_obj.values
-                if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
-                    key = np.char.decode(key, 'utf-8')
-                axis = read_hdf(filepath_or_buffer, attrs['axis_key'])
-                res = LGroup(key=key, name=name, axis=axis)
-            elif _type == 'scalar':
-                res = pd_obj.values
-                # XXX : assert len(res) == 1 ?
-                if len(res) == 1:
-                    res = res[0]
         except KeyError:
             filepath = filepath_or_buffer if isinstance(filepath_or_buffer, HDFStore) else store.filename
             raise KeyError('No item with name {} has been found in file {}'.format(key, filepath))
+        attrs = store.get_storer(key).attrs
+        writer = attrs.writer if 'writer' in attrs else None
+        _type = _get_type_from_attrs(attrs)
+        _meta = attrs.metadata if 'metadata' in attrs else None
+        if _type == 'Array':
+            # cartesian product is not necessary if the array was written by LArray
+            cartesian_prod = writer != 'LArray'
+            res = df_asarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value,
+                             parse_header=False, cartesian_prod=cartesian_prod)
+            if _meta is not None:
+                res.meta = _meta
+        elif _type == 'Axis':
+            if name is None:
+                name = str(pd_obj.name)
+            if name == 'None':
+                name = None
+            labels = pd_obj.values
+            if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
+                # this check is there because there are cases where dtype_kind is 'U' but pandas returns
+                # an array with object dtype containing bytes instead of a string array, and in that case
+                # np.char.decode does not work
+                # this is at least the case for Python2 + Pandas 0.24.2 combination
+                if labels.dtype.kind == 'O':
+                    labels = np.array([l.decode('utf-8') for l in labels], dtype='U')
+                else:
+                    labels = np.char.decode(labels, 'utf-8')
+            res = Axis(labels=labels, name=name)
+            res._iswildcard = attrs['wildcard']
+        elif _type == 'Group':
+            if name is None:
+                name = str(pd_obj.name)
+            if name == 'None':
+                name = None
+            key = pd_obj.values
+            if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
+                key = np.char.decode(key, 'utf-8')
+            axis = read_hdf(filepath_or_buffer, attrs['axis_key'])
+            res = LGroup(key=key, name=name, axis=axis)
+        elif _type in {cls.__name__ for cls in _supported_scalars_types}:
+            res = pd_obj.values
+            assert len(res) == 1
+            res = res[0]
     return res
 
 
```
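Two things changed here: the parsing code moved out of the try block, so the KeyError handler now only guards store.get(key) and no longer masks parsing errors behind the 'No item with name ...' message; and the scalar branch matches on the stored type name instead of a hard-coded 'scalar' tag. From the caller's side, read_hdf is a single entry point for every supported kind (key names below are illustrative):

```python
from larray import read_hdf

# every object lives at the root of the file and carries a 'type' attribute,
# so the same function returns whatever kind was stored under the key
arr = read_hdf('input.h5', 'arr1')   # Array
a = read_hdf('input.h5', 'a')        # Axis
s = read_hdf('input.h5', 's')        # plain str scalar
```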

```diff
@@ -136,46 +159,34 @@ def _open_for_write(self):
 
     def list_items(self):
         keys = [key.strip('/') for key in self.handle.keys()]
-        items = []
-        # scalars
-        items += [(key.split('/')[-1], 'scalar') for key in keys if '__scalars__' in key]
+        items = [(key, _get_type_from_attrs(self.handle.get_storer(key).attrs)) for key in keys if '/' not in key]
+        # ---- for backward compatibility (LArray < 0.33) ----
         # axes
-        items += [(key.split('/')[-1], 'Axis') for key in keys if '__axes__' in key]
+        items += [(key.split('/')[-1], 'Axis_Backward_Comp') for key in keys if '__axes__' in key]
         # groups
-        items += [(key.split('/')[-1], 'Group') for key in keys if '__groups__' in key]
-        # arrays
-        items += [(key, 'Array') for key in keys if '/' not in key]
+        items += [(key.split('/')[-1], 'Group_Backward_Comp') for key in keys if '__groups__' in key]
         return items
 
     def _read_item(self, key, type, *args, **kwargs):
-        if type == 'Array':
+        if type in {cls.__name__ for cls in _hdf_supported_types}:
             hdf_key = '/' + key
-        elif type == 'Axis':
+        # ---- for backward compatibility (LArray < 0.33) ----
+        elif type == 'Axis_Backward_Comp':
             hdf_key = '__axes__/' + key
-        elif type == 'Group':
+        elif type == 'Group_Backward_Comp':
             hdf_key = '__groups__/' + key
-        elif type == 'scalar':
-            hdf_key = '__scalars__/' + key
         else:
             raise TypeError()
         return read_hdf(self.handle, hdf_key, *args, **kwargs)
 
     def _dump_item(self, key, value, *args, **kwargs):
-        if isinstance(value, Array):
+        if isinstance(value, _supported_scalars_types):
+            value = ScalarHDF(value)
+        elif isinstance(value, Group):
+            kwargs['axis_key'] = '/' + value.axis.name
+        if hasattr(value, 'to_hdf'):
             hdf_key = '/' + key
             value.to_hdf(self.handle, hdf_key, *args, **kwargs)
-        elif isinstance(value, Axis):
-            hdf_key = '__axes__/' + key
-            value.to_hdf(self.handle, hdf_key, *args, **kwargs)
-        elif isinstance(value, Group):
-            hdf_key = '__groups__/' + key
-            hdf_axis_key = '__axes__/' + value.axis.name
-            value.to_hdf(self.handle, hdf_key, hdf_axis_key, *args, **kwargs)
-        elif isinstance(value, _supported_scalar_types):
-            hdf_key = '__scalars__/' + key
-            s = pd.Series(value)
-            self.handle.put(hdf_key, s)
-            self.handle.get_storer(hdf_key).attrs.type = 'scalar'
         else:
             raise TypeError()
 
```
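With the __axes__, __groups__ and __scalars__ groups gone, every item sits at the root of the HDF5 file and list_items recovers its kind from the 'type' attribute, falling back to 'Array' for files written by older larray versions. A sketch of that inspection with plain pandas (file name illustrative):

```python
import pandas as pd

with pd.HDFStore('input.h5') as store:
    for key in store.keys():  # e.g. ['/a', '/arr1', '/s']
        attrs = store.get_storer(key).attrs
        # same fallback as _get_type_from_attrs()
        kind = attrs.type if 'type' in attrs else 'Array'
        print(key, '->', kind)
```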

larray/inout/pickle.py

Lines changed: 9 additions & 5 deletions
```diff
@@ -9,7 +9,10 @@
 from larray.core.metadata import Metadata
 from larray.util.compat import pickle
 from larray.inout.session import register_file_handler
-from larray.inout.common import FileHandler, _supported_scalar_types
+from larray.inout.common import FileHandler, _supported_larray_types, _supported_scalars_types
+
+
+_pickle_supported_types = _supported_larray_types + _supported_scalars_types
 
 
 @register_file_handler('pickle', ['pkl', 'pickle'])
@@ -25,8 +28,9 @@ def _open_for_write(self):
         self.data = OrderedDict()
 
     def list_items(self):
-        # scalars
-        items = [(key, 'scalar') for key, value in self.data.items() if isinstance(value, _supported_scalar_types)]
+        # scalars
+        items = [(key, type(value).__name__) for key, value in self.data.items()
+                 if isinstance(value, _supported_scalars_types)]
         # axes
         items += [(key, 'Axis') for key, value in self.data.items() if isinstance(value, Axis)]
         # groups
@@ -36,13 +40,13 @@ def _open_for_write(self):
         return items
 
     def _read_item(self, key, type, *args, **kwargs):
-        if type in {'Array', 'Axis', 'Group', 'scalar'}:
+        if type in {cls.__name__ for cls in _pickle_supported_types}:
             return self.data[key]
         else:
             raise TypeError()
 
     def _dump_item(self, key, value, *args, **kwargs):
-        if isinstance(value, (Array, Axis, Group, _supported_scalar_types)):
+        if isinstance(value, _pickle_supported_types):
             self.data[key] = value
         else:
             raise TypeError()
```
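On the pickle side no wrapper is needed: the handler pickles objects whole, so scalar support boils down to a wider isinstance check plus reporting the concrete type name in list_items. A round-trip sketch (file name illustrative):

```python
from larray import Session, ndtest

ses = Session()
ses['arr1'] = ndtest((2, 2))
ses['s_int'] = 5  # a plain int, now accepted by the pickle handler

ses.save('input.pkl')
ses2 = Session('input.pkl')
assert ses2['s_int'] == 5
```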

larray/tests/test_session.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -10,7 +10,7 @@
 
 from larray.tests.common import (assert_array_nan_equal, inputpath, tmp_path, meta,
                                  needs_xlwings, needs_pytables, needs_xlrd)
-from larray.inout.common import _supported_scalar_types
+from larray.inout.common import _supported_scalars_types
 from larray import (Session, Axis, Array, Group, isnan, zeros_like, ndtest, ones_like, ones, full,
                     local_arrays, global_arrays, arrays)
 from larray.util.compat import pickle, PY2
```
```diff
@@ -180,7 +180,7 @@ def test_names(session):
 def _test_io(fpath, session, meta, engine):
     is_excel_or_csv = 'excel' in engine or 'csv' in engine
 
-    kind = Array if is_excel_or_csv else (Axis, Group, Array) + _supported_scalar_types
+    kind = Array if is_excel_or_csv else (Axis, Group, Array) + _supported_scalars_types
     session = session.filter(kind=kind)
 
     session.meta = meta
```
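The kind tuple widens the I/O round-trip tests to axes, groups and scalars for the HDF5 and pickle engines, while the Excel and CSV engines still only handle arrays. For instance, the filter call keeps only objects of the requested kinds (a sketch):

```python
from larray import Array, Axis, Session, ndtest

ses = Session()
ses['arr1'] = ndtest((2, 2))
ses['a'] = Axis('a=a0,a1')
ses['s_int'] = 5

# what the Excel/CSV branch of _test_io keeps
arrays_only = ses.filter(kind=Array)
assert arrays_only.names == ['arr1']
```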
```diff
@@ -229,6 +229,7 @@ def _test_io(fpath, session, meta, engine):
 
 
 def _add_scalars_to_session(s):
+    # 's' for scalar
     s['s_int'] = 5
     s['s_float'] = 5.5
     s['s_bool'] = True
```
