From 49d02ffcaaef3207104e258403d426814f36bb9d Mon Sep 17 00:00:00 2001 From: Alix Damman Date: Mon, 5 Aug 2019 12:01:40 +0200 Subject: [PATCH 1/3] fix #788 : convert unicode strings to bytes (and back) when dumping/loading Axis and Group objects to/from HDF files (to avoid huge HDF files) --- larray/core/axis.py | 5 ++++- larray/core/group.py | 7 ++++++- larray/inout/hdf.py | 10 ++++++++-- larray/tests/test_axis.py | 5 +++++ larray/tests/test_group.py | 5 +++++ 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index 2d8f81baa..2d4e689b9 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -1349,10 +1349,13 @@ def to_hdf(self, filepath, key=None): raise ValueError("Argument key must be provided explicitly in case of anonymous axis") key = self.name key = _translate_group_key_hdf(key) - s = pd.Series(data=self.labels, name=self.name) + kind = self.labels.dtype.kind + data = np.char.encode(self.labels, 'utf-8') if kind == 'U' else self.labels + s = pd.Series(data=data, name=self.name) with LHDFStore(filepath) as store: store.put(key, s) store.get_storer(key).attrs.type = 'Axis' + store.get_storer(key).attrs.kind = kind store.get_storer(key).attrs.wildcard = self.iswildcard @property diff --git a/larray/core/group.py b/larray/core/group.py index 52030d474..b1739bbb7 100644 --- a/larray/core/group.py +++ b/larray/core/group.py @@ -1462,10 +1462,15 @@ def to_hdf(self, filepath, key=None, axis_key=None): if self.axis.name is None: raise ValueError("Argument axis_key must be provided explicitly if the associated axis is anonymous") axis_key = self.axis.name - s = pd.Series(data=self.eval(), name=self.name) + data = self.eval() + kind = data.dtype.kind if isinstance(data, np.ndarray) else '' + if kind == 'U': + data = np.char.encode(data, 'utf-8') + s = pd.Series(data=data, name=self.name) with LHDFStore(filepath) as store: store.put(key, s) store.get_storer(key).attrs.type = 'Group' + 
store.get_storer(key).attrs.kind = kind if axis_key not in store: self.axis.to_hdf(store, key=axis_key) store.get_storer(key).attrs.axis_key = axis_key diff --git a/larray/inout/hdf.py b/larray/inout/hdf.py index 025498add..ce192413d 100644 --- a/larray/inout/hdf.py +++ b/larray/inout/hdf.py @@ -88,15 +88,21 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s name = str(pd_obj.name) if name == 'None': name = None - res = Axis(labels=pd_obj.values, name=name) + labels = pd_obj.values + if 'kind' in attrs and attrs['kind'] == 'U': + labels = np.char.decode(labels, 'utf-8') + res = Axis(labels=labels, name=name) res._iswildcard = attrs['wildcard'] elif _type == 'Group': if name is None: name = str(pd_obj.name) if name == 'None': name = None + key = pd_obj.values + if 'kind' in attrs and attrs['kind'] == 'U': + key = np.char.decode(key, 'utf-8') axis = read_hdf(filepath_or_buffer, attrs['axis_key']) - res = LGroup(key=pd_obj.values, name=name, axis=axis) + res = LGroup(key=key, name=name, axis=axis) return res diff --git a/larray/tests/test_axis.py b/larray/tests/test_axis.py index 87ed1de75..bf4bbd821 100644 --- a/larray/tests/test_axis.py +++ b/larray/tests/test_axis.py @@ -391,6 +391,7 @@ def test_h5_io(tmpdir): lipro = Axis('lipro=P01..P05') anonymous = Axis(range(3)) wildcard = Axis(3, 'wildcard') + string_axis = Axis(['@!àéè&%µ$~', '/*-+_§()><', 'another label'], 'string_axis') fpath = os.path.join(str(tmpdir), 'axes.h5') # ---- default behavior ---- @@ -410,6 +411,10 @@ def test_h5_io(tmpdir): wildcard2 = read_hdf(fpath, key=wildcard.name) assert wildcard2.iswildcard assert wildcard.equals(wildcard2) + # string axis + string_axis.to_hdf(fpath) + string_axis2 = read_hdf(fpath, string_axis.name) + assert string_axis.equals(string_axis2) # ---- specific key ---- # int axis diff --git a/larray/tests/test_group.py b/larray/tests/test_group.py index fba0c5ad6..7fe44a08d 100644 --- a/larray/tests/test_group.py +++ 
b/larray/tests/test_group.py @@ -192,6 +192,7 @@ def test_h5_io_lgroup(tmpdir): named_axis_not_in_file = lipro['P01,P03,P05'] >> 'P_odd' anonymous = age[':5'] wildcard = age_wildcard[':5'] >> 'age_w_05' + string_group = Axis(['@!àéè&%µ$~', '/*-+_§()><', 'another label'], 'string_axis')[:] >> 'string_group' # ---- default behavior ---- # named group @@ -209,6 +210,10 @@ def test_h5_io_lgroup(tmpdir): named_axis_not_in_file.to_hdf(fpath) named2 = read_hdf(fpath, key=named_axis_not_in_file.name) assert all(named_axis_not_in_file == named2) + # string group + string_group.to_hdf(fpath) + string_group2 = read_hdf(fpath, key=string_group.name) + assert all(string_group == string_group2) # ---- specific hdf group + key ---- hdf_group = 'my_groups' From a874db5cc31b2307af06cb8c3abb85dadb294e28 Mon Sep 17 00:00:00 2001 From: Alix Damman Date: Mon, 5 Aug 2019 13:27:48 +0200 Subject: [PATCH 2/3] renamed attribute kind as dtype_kind --- larray/core/axis.py | 6 +++--- larray/core/group.py | 6 +++--- larray/inout/hdf.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index 2d4e689b9..5ae80ab03 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -1349,13 +1349,13 @@ def to_hdf(self, filepath, key=None): raise ValueError("Argument key must be provided explicitly in case of anonymous axis") key = self.name key = _translate_group_key_hdf(key) - kind = self.labels.dtype.kind - data = np.char.encode(self.labels, 'utf-8') if kind == 'U' else self.labels + dtype_kind = self.labels.dtype.kind + data = np.char.encode(self.labels, 'utf-8') if dtype_kind == 'U' else self.labels s = pd.Series(data=data, name=self.name) with LHDFStore(filepath) as store: store.put(key, s) store.get_storer(key).attrs.type = 'Axis' - store.get_storer(key).attrs.kind = kind + store.get_storer(key).attrs.dtype_kind = dtype_kind store.get_storer(key).attrs.wildcard = self.iswildcard @property diff --git a/larray/core/group.py 
b/larray/core/group.py index b1739bbb7..6d93c9e9f 100644 --- a/larray/core/group.py +++ b/larray/core/group.py @@ -1463,14 +1463,14 @@ def to_hdf(self, filepath, key=None, axis_key=None): raise ValueError("Argument axis_key must be provided explicitly if the associated axis is anonymous") axis_key = self.axis.name data = self.eval() - kind = data.dtype.kind if isinstance(data, np.ndarray) else '' - if kind == 'U': + dtype_kind = data.dtype.kind if isinstance(data, np.ndarray) else '' + if dtype_kind == 'U': data = np.char.encode(data, 'utf-8') s = pd.Series(data=data, name=self.name) with LHDFStore(filepath) as store: store.put(key, s) store.get_storer(key).attrs.type = 'Group' - store.get_storer(key).attrs.kind = kind + store.get_storer(key).attrs.dtype_kind = dtype_kind if axis_key not in store: self.axis.to_hdf(store, key=axis_key) store.get_storer(key).attrs.axis_key = axis_key diff --git a/larray/inout/hdf.py b/larray/inout/hdf.py index ce192413d..09ea6d0c7 100644 --- a/larray/inout/hdf.py +++ b/larray/inout/hdf.py @@ -89,7 +89,7 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s if name == 'None': name = None labels = pd_obj.values - if 'kind' in attrs and attrs['kind'] == 'U': + if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U': labels = np.char.decode(labels, 'utf-8') res = Axis(labels=labels, name=name) res._iswildcard = attrs['wildcard'] @@ -99,7 +99,7 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s if name == 'None': name = None key = pd_obj.values - if 'kind' in attrs and attrs['kind'] == 'U': + if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U': key = np.char.decode(key, 'utf-8') axis = read_hdf(filepath_or_buffer, attrs['axis_key']) res = LGroup(key=key, name=name, axis=axis) From 11a1c6f78d4162906622edecffe13926275bb73e Mon Sep 17 00:00:00 2001 From: Alix Damman Date: Mon, 5 Aug 2019 13:29:17 +0200 Subject: [PATCH 3/3] specified encoding in test_xxx.py modules (for Python 
2) --- larray/tests/test_array.py | 1 + larray/tests/test_axis.py | 2 ++ larray/tests/test_group.py | 2 ++ 3 files changed, 5 insertions(+) diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 4d669d2ca..268e73f72 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -1,3 +1,4 @@ +# -*- coding: utf8 -*- from __future__ import absolute_import, division, print_function import os diff --git a/larray/tests/test_axis.py b/larray/tests/test_axis.py index bf4bbd821..1410e9e26 100644 --- a/larray/tests/test_axis.py +++ b/larray/tests/test_axis.py @@ -1,4 +1,6 @@ +# -*- coding: utf8 -*- from __future__ import absolute_import, division, print_function + import pytest import os.path import numpy as np diff --git a/larray/tests/test_group.py b/larray/tests/test_group.py index 7fe44a08d..9fd582bab 100644 --- a/larray/tests/test_group.py +++ b/larray/tests/test_group.py @@ -1,4 +1,6 @@ +# -*- coding: utf8 -*- from __future__ import absolute_import, division, print_function + import pytest import os.path import numpy as np