Commit e42fb3b

>> added support for dumping/saving scalars (HDF5 + pickle formats).
>> removed the special pickle/HDF groups __axes__ and __groups__ since they are no longer required.
1 parent d7627ac commit e42fb3b

File tree

5 files changed: +103 -84 lines

larray/core/session.py
larray/inout/common.py
larray/inout/hdf.py
larray/inout/pickle.py
larray/tests/test_session.py
larray/core/session.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -394,7 +394,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
         >>> # the path of file to the Session constructor
         >>> ses = Session('input.h5')
         >>> ses
-        Session(i, s, a, b, a01, arr1, arr2)
+        Session(a, a01, arr1, arr2, b, i, s)
         >>> ses.meta
         title: my title
         author: John Smith
@@ -404,11 +404,11 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
         >>> ses = Session()
         >>> ses.load('input.h5', names=['s', 'a', 'b', 'arr1', 'arr2'], display=True)
         opening input.h5
-        loading scalar object s ... done
         loading Axis object a ... done
-        loading Axis object b ... done
         loading Array object arr1 ... done
         loading Array object arr2 ... done
+        loading Axis object b ... done
+        loading str object s ... done
 
         Using .csv files (assuming the same session as above)
 
```
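These doctest updates reflect the new behavior: the contents of a loaded Session now appear in alphabetical order, and a scalar is reported with its concrete type (here str) instead of the generic 'scalar' label. As a minimal round-trip sketch (file and variable names are illustrative; saving to HDF5 requires pytables):

```python
from larray import Axis, Session, ndtest

# build a session mixing an array, an axis and a plain Python scalar
ses = Session()
ses['arr1'] = ndtest((2, 3))
ses['a'] = Axis('a=a0,a1')
ses['s'] = 'my string scalar'

# with this commit, the scalar is dumped and reloaded like any other item
ses.save('input.h5')
ses2 = Session('input.h5')
assert ses2['s'] == 'my string scalar'
```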

larray/inout/common.py

Lines changed: 7 additions & 4 deletions
```diff
@@ -5,14 +5,17 @@
 from collections import OrderedDict
 
 from larray.util.compat import basestring
+from larray.core.axis import Axis
+from larray.core.group import Group
 from larray.core.array import Array
 
 
+# all formats
+_supported_larray_types = (Axis, Group, Array)
+
 # only for HDF5 and pickle formats
-# XXX: also include scalars in Excel and CSV formats (via __scalars__ special sheet/.csv file)?
-# But then why saving scalars but not Axis and Group objects. That would be inconsistent.
-# XXX: support list and dict?
-_supported_scalar_types = (int, float, bool, basestring, date, time, datetime)
+# support list, tuple and dict?
+_supported_scalars_types = (int, float, bool, basestring, date, time, datetime)
 
 
 def _get_index_col(nb_axes=None, index_col=None, wide=True):
```
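These two tuples now drive the isinstance checks in the handlers: `_supported_larray_types` applies to all formats, `_supported_scalars_types` only to HDF5 and pickle. A small sketch of how both sides of a dump/load cycle use them (assuming Python 3, where basestring is aliased to str):

```python
from datetime import date, time, datetime

# same contents as _supported_scalars_types above (basestring -> str on Python 3)
supported_scalars = (int, float, bool, str, date, time, datetime)

value = 0.025
# dump side: gate on the classes themselves
assert isinstance(value, supported_scalars)

# load side: read_hdf() and _read_item() match on the stored type *name*,
# so a name set is derived from the same tuple
scalar_names = {cls.__name__ for cls in supported_scalars}
assert type(value).__name__ in scalar_names
```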

larray/inout/hdf.py

Lines changed: 81 additions & 70 deletions
```diff
@@ -13,11 +13,36 @@
 from larray.core.metadata import Metadata
 from larray.util.misc import LHDFStore
 from larray.inout.session import register_file_handler
-from larray.inout.common import FileHandler, _supported_scalar_types
+from larray.inout.common import FileHandler, _supported_larray_types, _supported_scalars_types
 from larray.inout.pandas import df_asarray
 from larray.example import get_example_filepath
 
 
+_hdf_supported_types = _supported_larray_types + _supported_scalars_types
+
+
+class ScalarHDF(object):
+    def __init__(self, value):
+        _type = type(value).__name__
+        if not isinstance(value, _supported_scalars_types):
+            raise TypeError("Type {} is not currently supported by the HDF5 format".format(_type))
+        self.value = value
+        self._type = _type
+
+    def to_hdf(self, filepath, key):
+        key = _translate_group_key_hdf(key)
+        s = pd.Series(data=self.value)
+        with LHDFStore(filepath) as store:
+            store.put(key, s)
+            store.get_storer(key).attrs.type = self._type
+
+
+# for backward compatibility (larray < 0.29), but any object read from an HDF file
+# should have a 'type' attribute
+def _get_type_from_attrs(attrs):
+    return attrs.type if 'type' in attrs else 'Array'
+
+
 def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, sort_columns=False,
              name=None, **kwargs):
     r"""Reads a scalar, axis, group or array named key from an HDF5 file in filepath (path+name)
```
```diff
@@ -73,53 +98,51 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s
     with LHDFStore(filepath_or_buffer) as store:
         try:
             pd_obj = store.get(key)
-            attrs = store.get_storer(key).attrs
-            writer = attrs.writer if 'writer' in attrs else None
-            # for backward compatibility but any object read from an hdf file should have an attribute 'type'
-            _type = attrs.type if 'type' in attrs else 'Array'
-            _meta = attrs.metadata if 'metadata' in attrs else None
-            if _type == 'Array':
-                # cartesian product is not necessary if the array was written by LArray
-                cartesian_prod = writer != 'LArray'
-                res = df_asarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value,
-                                 parse_header=False, cartesian_prod=cartesian_prod)
-                if _meta is not None:
-                    res.meta = _meta
-            elif _type == 'Axis':
-                if name is None:
-                    name = str(pd_obj.name)
-                if name == 'None':
-                    name = None
-                labels = pd_obj.values
-                if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
-                    # this check is there because there are cases where dtype_kind is 'U' but pandas returns
-                    # an array with object dtype containing bytes instead of a string array, and in that case
-                    # np.char.decode does not work
-                    # this is at least the case for Python2 + Pandas 0.24.2 combination
-                    if labels.dtype.kind == 'O':
-                        labels = np.array([l.decode('utf-8') for l in labels], dtype='U')
-                    else:
-                        labels = np.char.decode(labels, 'utf-8')
-                res = Axis(labels=labels, name=name)
-                res._iswildcard = attrs['wildcard']
-            elif _type == 'Group':
-                if name is None:
-                    name = str(pd_obj.name)
-                if name == 'None':
-                    name = None
-                key = pd_obj.values
-                if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
-                    key = np.char.decode(key, 'utf-8')
-                axis = read_hdf(filepath_or_buffer, attrs['axis_key'])
-                res = LGroup(key=key, name=name, axis=axis)
-            elif _type == 'scalar':
-                res = pd_obj.values
-                # XXX : assert len(res) == 1 ?
-                if len(res) == 1:
-                    res = res[0]
         except KeyError:
             filepath = filepath_or_buffer if isinstance(filepath_or_buffer, HDFStore) else store.filename
             raise KeyError('No item with name {} has been found in file {}'.format(key, filepath))
+        attrs = store.get_storer(key).attrs
+        writer = attrs.writer if 'writer' in attrs else None
+        _type = _get_type_from_attrs(attrs)
+        _meta = attrs.metadata if 'metadata' in attrs else None
+        if _type == 'Array':
+            # cartesian product is not necessary if the array was written by LArray
+            cartesian_prod = writer != 'LArray'
+            res = df_asarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value,
+                             parse_header=False, cartesian_prod=cartesian_prod)
+            if _meta is not None:
+                res.meta = _meta
+        elif _type == 'Axis':
+            if name is None:
+                name = str(pd_obj.name)
+            if name == 'None':
+                name = None
+            labels = pd_obj.values
+            if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
+                # this check is there because there are cases where dtype_kind is 'U' but pandas returns
+                # an array with object dtype containing bytes instead of a string array, and in that case
+                # np.char.decode does not work
+                # this is at least the case for Python2 + Pandas 0.24.2 combination
+                if labels.dtype.kind == 'O':
+                    labels = np.array([l.decode('utf-8') for l in labels], dtype='U')
+                else:
+                    labels = np.char.decode(labels, 'utf-8')
+            res = Axis(labels=labels, name=name)
+            res._iswildcard = attrs['wildcard']
+        elif _type == 'Group':
+            if name is None:
+                name = str(pd_obj.name)
+            if name == 'None':
+                name = None
+            key = pd_obj.values
+            if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U':
+                key = np.char.decode(key, 'utf-8')
+            axis = read_hdf(filepath_or_buffer, attrs['axis_key'])
+            res = LGroup(key=key, name=name, axis=axis)
+        elif _type in {cls.__name__ for cls in _supported_scalars_types}:
+            res = pd_obj.values
+            assert len(res) == 1
+            res = res[0]
     return res
 
 
```
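Two things changed here: the parsing code moved out of the try block, so the KeyError handler now only guards store.get(key) and no longer masks parsing errors behind the 'No item with name ...' message; and the scalar branch matches on the stored type name instead of a hard-coded 'scalar' tag. From the caller's side, read_hdf is a single entry point for every supported kind (key names below are illustrative):

```python
from larray import read_hdf

# every object lives at the root of the file and carries a 'type' attribute,
# so the same function returns whatever kind was stored under the key
arr = read_hdf('input.h5', 'arr1')   # Array
a = read_hdf('input.h5', 'a')        # Axis
s = read_hdf('input.h5', 's')        # plain str scalar
```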

```diff
@@ -136,46 +159,34 @@ def _open_for_write(self):
 
     def list_items(self):
         keys = [key.strip('/') for key in self.handle.keys()]
-        items = []
-        # scalars
-        items += [(key.split('/')[-1], 'scalar') for key in keys if '__scalars__' in key]
+        items = [(key, _get_type_from_attrs(self.handle.get_storer(key).attrs)) for key in keys if '/' not in key]
+        # ---- for backward compatibility (LArray < 0.33) ----
         # axes
-        items += [(key.split('/')[-1], 'Axis') for key in keys if '__axes__' in key]
+        items += [(key.split('/')[-1], 'Axis_Backward_Comp') for key in keys if '__axes__' in key]
         # groups
-        items += [(key.split('/')[-1], 'Group') for key in keys if '__groups__' in key]
-        # arrays
-        items += [(key, 'Array') for key in keys if '/' not in key]
+        items += [(key.split('/')[-1], 'Group_Backward_Comp') for key in keys if '__groups__' in key]
         return items
 
     def _read_item(self, key, type, *args, **kwargs):
-        if type == 'Array':
+        if type in {cls.__name__ for cls in _hdf_supported_types}:
             hdf_key = '/' + key
-        elif type == 'Axis':
+        # ---- for backward compatibility (LArray < 0.33) ----
+        elif type == 'Axis_Backward_Comp':
             hdf_key = '__axes__/' + key
-        elif type == 'Group':
+        elif type == 'Group_Backward_Comp':
             hdf_key = '__groups__/' + key
-        elif type == 'scalar':
-            hdf_key = '__scalars__/' + key
         else:
             raise TypeError()
         return read_hdf(self.handle, hdf_key, *args, **kwargs)
 
     def _dump_item(self, key, value, *args, **kwargs):
-        if isinstance(value, Array):
+        if isinstance(value, _supported_scalars_types):
+            value = ScalarHDF(value)
+        elif isinstance(value, Group):
+            kwargs['axis_key'] = '/' + value.axis.name
+        if hasattr(value, 'to_hdf'):
             hdf_key = '/' + key
             value.to_hdf(self.handle, hdf_key, *args, **kwargs)
-        elif isinstance(value, Axis):
-            hdf_key = '__axes__/' + key
-            value.to_hdf(self.handle, hdf_key, *args, **kwargs)
-        elif isinstance(value, Group):
-            hdf_key = '__groups__/' + key
-            hdf_axis_key = '__axes__/' + value.axis.name
-            value.to_hdf(self.handle, hdf_key, hdf_axis_key, *args, **kwargs)
-        elif isinstance(value, _supported_scalar_types):
-            hdf_key = '__scalars__/' + key
-            s = pd.Series(value)
-            self.handle.put(hdf_key, s)
-            self.handle.get_storer(hdf_key).attrs.type = 'scalar'
         else:
             raise TypeError()
 
```
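With the __axes__, __groups__ and __scalars__ groups gone, every item sits at the root of the HDF5 file and list_items recovers its kind from the 'type' attribute, falling back to 'Array' for files written by older larray versions. A sketch of that inspection with plain pandas (file name illustrative):

```python
import pandas as pd

with pd.HDFStore('input.h5') as store:
    for key in store.keys():  # e.g. ['/a', '/arr1', '/s']
        attrs = store.get_storer(key).attrs
        # same fallback as _get_type_from_attrs()
        kind = attrs.type if 'type' in attrs else 'Array'
        print(key, '->', kind)
```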

larray/inout/pickle.py

Lines changed: 9 additions & 5 deletions
```diff
@@ -9,7 +9,10 @@
 from larray.core.metadata import Metadata
 from larray.util.compat import pickle
 from larray.inout.session import register_file_handler
-from larray.inout.common import FileHandler, _supported_scalar_types
+from larray.inout.common import FileHandler, _supported_larray_types, _supported_scalars_types
+
+
+_pickle_supported_types = _supported_larray_types + _supported_scalars_types
 
 
 @register_file_handler('pickle', ['pkl', 'pickle'])
@@ -25,8 +28,9 @@ def _open_for_write(self):
         self.data = OrderedDict()
 
     def list_items(self):
-        # scalars
-        items = [(key, 'scalar') for key, value in self.data.items() if isinstance(value, _supported_scalar_types)]
+        # scalars
+        items = [(key, type(value).__name__) for key, value in self.data.items()
+                 if isinstance(value, _supported_scalars_types)]
         # axes
         items += [(key, 'Axis') for key, value in self.data.items() if isinstance(value, Axis)]
         # groups
@@ -36,13 +40,13 @@ def _open_for_write(self):
         return items
 
     def _read_item(self, key, type, *args, **kwargs):
-        if type in {'Array', 'Axis', 'Group', 'scalar'}:
+        if type in {cls.__name__ for cls in _pickle_supported_types}:
             return self.data[key]
         else:
             raise TypeError()
 
     def _dump_item(self, key, value, *args, **kwargs):
-        if isinstance(value, (Array, Axis, Group, _supported_scalar_types)):
+        if isinstance(value, _pickle_supported_types):
             self.data[key] = value
         else:
             raise TypeError()
```
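On the pickle side no wrapper is needed: the handler pickles objects whole, so scalar support boils down to a wider isinstance check plus reporting the concrete type name in list_items. A round-trip sketch (file name illustrative):

```python
from larray import Session, ndtest

ses = Session()
ses['arr1'] = ndtest((2, 2))
ses['s_int'] = 5  # a plain int, now accepted by the pickle handler

ses.save('input.pkl')
ses2 = Session('input.pkl')
assert ses2['s_int'] == 5
```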

larray/tests/test_session.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -10,7 +10,7 @@
 
 from larray.tests.common import (assert_array_nan_equal, inputpath, tmp_path, meta,
                                  needs_xlwings, needs_pytables, needs_xlrd)
-from larray.inout.common import _supported_scalar_types
+from larray.inout.common import _supported_scalars_types
 from larray import (Session, Axis, Array, Group, isnan, zeros_like, ndtest, ones_like, ones, full,
                     local_arrays, global_arrays, arrays)
 from larray.util.compat import pickle, PY2
```
```diff
@@ -180,7 +180,7 @@ def test_names(session):
 def _test_io(fpath, session, meta, engine):
     is_excel_or_csv = 'excel' in engine or 'csv' in engine
 
-    kind = Array if is_excel_or_csv else (Axis, Group, Array) + _supported_scalar_types
+    kind = Array if is_excel_or_csv else (Axis, Group, Array) + _supported_scalars_types
     session = session.filter(kind=kind)
 
     session.meta = meta
```
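The kind tuple widens the I/O round-trip tests to axes, groups and scalars for the HDF5 and pickle engines, while the Excel and CSV engines still only handle arrays. For instance, the filter call keeps only objects of the requested kinds (a sketch):

```python
from larray import Array, Axis, Session, ndtest

ses = Session()
ses['arr1'] = ndtest((2, 2))
ses['a'] = Axis('a=a0,a1')
ses['s_int'] = 5

# what the Excel/CSV branch of _test_io keeps
arrays_only = ses.filter(kind=Array)
assert arrays_only.names == ['arr1']
```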
```diff
@@ -229,6 +229,7 @@ def _test_io(fpath, session, meta, engine):
 
 
 def _add_scalars_to_session(s):
+    # 's' for scalar
     s['s_int'] = 5
     s['s_float'] = 5.5
     s['s_bool'] = True
```
