9 changes: 8 additions & 1 deletion doc/source/changes/version_0_32.rst.inc
@@ -55,4 +55,11 @@ Miscellaneous improvements
Fixes
^^^^^

* fixed something (closes :issue:`1`).
* fixed reading/exporting sessions containing two or more axes/groups
Contributor

The work you have put into this PR makes me wonder even more whether saving sessions (especially ones containing non-LArray objects) to CSV and to Excel is worth it. I fear they will give us an endless stream of problems for little benefit.

Collaborator Author

No one noticed the bug, which means no one will notice if we drop the ability to save and load axes and groups with the CSV or Excel formats.
So, should I edit the title of the corresponding issue and drop Axis and Group objects when calling Session.save()/load()?

Contributor

@gdementen gdementen Sep 20, 2019

I am unsure where to draw the line/what's best:

  1. support saving axes and groups (and possibly arrays with metadata) only to HDF (or other formats where we can store arbitrary things). In that case, we will need to output a warning when saving a session containing them to csv/excel (a rough sketch of such a warning is shown after this comment). Or we could go the GIMP route and say there is a single "native" format (HDF or .la or whatever) and there are other formats we can export to, each with their own limitations.
  2. only support saving sessions to HDF-like formats.
  3. keep trying to support everything for all formats. Let's be clear: I would love to have that, but I fear the time spent on it could be better spent elsewhere.
  4. possibly another intermediate option I am not seeing?

I think we need to discuss this face to face, as it will be easier.
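
To make option 1 concrete, here is a minimal sketch of what such a warning could look like. The helper name and the isinstance-based filtering are illustrative assumptions, not part of the current larray API:

import warnings

from larray import Axis, Group   # assumes larray exposes the Axis and Group base classes


def _filter_items_for_flat_format(items, fmt='csv'):
    """Return only the (name, value) pairs a flat format can store, warning about the rest."""
    # Axis and Group objects cannot be stored faithfully in a flat CSV/Excel
    # layout, so warn about them and keep only the remaining items.
    skipped = [name for name, value in items if isinstance(value, (Axis, Group))]
    if skipped:
        warnings.warn("the {} format cannot store Axis/Group objects; "
                      "skipping: {}".format(fmt, ', '.join(skipped)))
    return [(name, value) for name, value in items
            if not isinstance(value, (Axis, Group))]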

with the same name (or anonymous) from/to CSV, Excel and HDF files (closes :issue:`803`).

* fixed NaNs and None labels appearing in axes and groups when reading/exporting sessions
from/to CSV and Excel files (closes :issue:`804`).

* fixed importing/exporting anonymous and/or wildcard axes to CSV and Excel files
Contributor

Was the problem present only when importing/exporting the axes objects themselves, only for arrays using such axes, or both?

(closes :issue:`805`).
15 changes: 7 additions & 8 deletions larray/core/array.py
@@ -1185,22 +1185,21 @@ def to_frame(self, fold_last_axis_name=False, dropna=None):
b1 6 7
"""
columns = pd.Index(self.axes[-1].labels)
axes_names = self.axes.display_names[:]
Contributor

Are you sure using display_names here is a good idea? You'll get * in axes names for wildcard axes and {axis_pos} for anonymous axes...

I am very nervous about this change. I fear it must break "something" somewhere given that to_frame is used in a lot of places (at least indirectly). Pandas dataframes with "{0}*" as an explicit name would be ugly, right?
As would plots with such labels.
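
For reference, a quick illustration of the concern, matching the test added in test_array.py below (the printed values assume the behaviour introduced by this change):

from larray import Axis, ndtest

arr = ndtest((Axis(2), Axis(2), Axis(2)))   # three anonymous wildcard axes
print(arr.axes.names)                        # [None, None, None]
print(arr.axes.display_names)                # ['{0}*', '{1}*', '{2}*']

df = arr.to_frame()
print(df.index.names)                        # ['{0}*', '{1}*'] -- also what plots would show
print(df.columns.name)                       # '{2}*'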

if not fold_last_axis_name:
columns.name = self.axes[-1].name
columns.name = axes_names[-1]
if self.ndim > 1:
axes_names = self.axes.names[:-1]
_axes_names = axes_names[:-1]
if fold_last_axis_name:
tmp = axes_names[-1] if axes_names[-1] is not None else ''
if self.axes[-1].name:
axes_names[-1] = "{}\\{}".format(tmp, self.axes[-1].name)
_axes_names[-1] = "{}\\{}".format(_axes_names[-1], axes_names[-1])
if self.ndim == 2:
index = pd.Index(data=self.axes[0].labels, name=axes_names[0])
index = pd.Index(data=self.axes[0].labels, name=_axes_names[0])
else:
index = pd.MultiIndex.from_product(self.axes.labels[:-1], names=axes_names)
index = pd.MultiIndex.from_product(self.axes.labels[:-1], names=_axes_names)
else:
index = pd.Index([''])
if fold_last_axis_name:
index.name = self.axes.names[-1]
index.name = axes_names[-1]
data = np.asarray(self).reshape(len(index), len(columns))
df = pd.DataFrame(data, index, columns)
if dropna is not None:
10 changes: 5 additions & 5 deletions larray/inout/csv.py
@@ -329,15 +329,15 @@ def list_items(self):
pass
try:
fnames.remove('__axes__')
items = [(name, 'Axis') for name in sorted(self.axes.keys())]
items = [(key, 'Axis') for key in sorted(self.axes.keys())]
except:
pass
try:
fnames.remove('__groups__')
items += [(name, 'Group') for name in sorted(self.groups.keys())]
items += [(key, 'Group') for key in sorted(self.groups.keys())]
except:
pass
items += [(name, 'Array') for name in fnames]
items += [(key, 'Array') for key in fnames]
return items

def _read_item(self, key, type, *args, **kwargs):
@@ -375,10 +375,10 @@ def _dump_metadata(self, metadata):

def save(self):
if len(self.axes) > 0:
df = _axes_to_df(self.axes.values())
df = _axes_to_df(self.axes)
df.to_csv(self._to_filepath('__axes__'), sep=self.sep, index=False)
if len(self.groups) > 0:
df = _groups_to_df(self.groups.values())
df = _groups_to_df(self.groups)
df.to_csv(self._to_filepath('__groups__'), sep=self.sep, index=False)

def close(self):
20 changes: 10 additions & 10 deletions larray/inout/excel.py
@@ -268,15 +268,15 @@ def list_items(self):
pass
try:
sheet_names.remove('__axes__')
items = [(name, 'Axis') for name in sorted(self.axes.keys())]
items = [(key, 'Axis') for key in sorted(self.axes.keys())]
except:
pass
try:
sheet_names.remove('__groups__')
items += [(name, 'Group') for name in sorted(self.groups.keys())]
items += [(key, 'Group') for key in sorted(self.groups.keys())]
except:
pass
items += [(name, 'Array') for name in sheet_names]
items += [(key, 'Array') for key in sheet_names]
return items

def _read_item(self, key, type, *args, **kwargs):
@@ -316,10 +316,10 @@ def _dump_metadata(self, metadata):

def save(self):
if len(self.axes) > 0:
df = _axes_to_df(self.axes.values())
df = _axes_to_df(self.axes)
df.to_excel(self.handle, '__axes__', index=False, engine='xlsxwriter')
if len(self.groups) > 0:
df = _groups_to_df(self.groups.values())
df = _groups_to_df(self.groups)
df.to_excel(self.handle, '__groups__', index=False, engine='xlsxwriter')

def close(self):
@@ -373,15 +373,15 @@ def list_items(self):
pass
try:
sheet_names.remove('__axes__')
items = [(name, 'Axis') for name in sorted(self.axes.keys())]
items = [(key, 'Axis') for key in sorted(self.axes.keys())]
except:
pass
try:
sheet_names.remove('__groups__')
items += [(name, 'Group') for name in sorted(self.groups.keys())]
items += [(key, 'Group') for key in sorted(self.groups.keys())]
except:
pass
items += [(name, 'Array') for name in sheet_names]
items += [(key, 'Array') for key in sheet_names]
return items

def _read_item(self, key, type, *args, **kwargs):
@@ -419,11 +419,11 @@ def _dump_metadata(self, metadata):

def save(self):
if len(self.axes) > 0:
df = _axes_to_df(self.axes.values())
df = _axes_to_df(self.axes)
self.handle['__axes__'] = ''
self.handle['__axes__'][:].options(pd.DataFrame, index=False).value = df
if len(self.groups) > 0:
df = _groups_to_df(self.groups.values())
df = _groups_to_df(self.groups)
self.handle['__groups__'] = ''
self.handle['__groups__'][:].options(pd.DataFrame, index=False).value = df
self.handle.save()
2 changes: 0 additions & 2 deletions larray/inout/hdf.py
@@ -132,10 +132,8 @@ def _read_item(self, key, type, *args, **kwargs):
hdf_key = '/' + key
elif type == 'Axis':
hdf_key = '__axes__/' + key
kwargs['name'] = key
elif type == 'Group':
hdf_key = '__groups__/' + key
kwargs['name'] = key
else:
raise TypeError()
return read_hdf(self.handle, hdf_key, *args, **kwargs)
81 changes: 62 additions & 19 deletions larray/inout/pandas.py
@@ -1,5 +1,6 @@
from __future__ import absolute_import, print_function

import re
from itertools import product
from collections import OrderedDict

@@ -140,6 +141,9 @@ def from_series(s, sort_rows=False, fill_value=nan, meta=None, **kwargs):
return LArray(s.values, Axis(s.index.values, name), meta=meta)


_anonymous_axis_pattern = re.compile(r'\{(\d+|\??)\}\*?')


def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfold_last_axis_name=False,
fill_value=nan, meta=None, cartesian_prod=True, **kwargs):
r"""
@@ -247,7 +251,16 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
axes_names = [str(name) if name is not None else name
for name in axes_names]

axes = AxisCollection([Axis(labels, name) for labels, name in zip(axes_labels, axes_names)])
def _to_axis(labels, name):
if name is not None:
if name[-1] == '*':
labels = len(labels)
name = name[:-1]
if _anonymous_axis_pattern.match(name):
name = None
return Axis(labels, name)

axes = AxisCollection([_to_axis(labels, name) for labels, name in zip(axes_labels, axes_names)])
data = df.values.reshape(axes.shape)
return LArray(data, axes, meta=meta)

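As a purely illustrative check (not part of the diff), the _anonymous_axis_pattern above accepts positional anonymous names such as '{0}', the '{?}' placeholder and an empty '{}', each optionally followed by '*' for wildcard axes:

import re

_anonymous_axis_pattern = re.compile(r'\{(\d+|\??)\}\*?')

for name in ('{0}', '{12}*', '{?}', '{}', 'age', 'age*'):
    print(name, bool(_anonymous_axis_pattern.match(name)))
# -> {0} True, {12}* True, {?} True, {} True, age False, age* False
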
@@ -338,51 +351,81 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header
# SERIES <--> AXIS, GROUP, META #
# #################################### #

def _axis_to_series(axis, dtype=None):
return pd.Series(data=axis.labels, name=str(axis), dtype=dtype)
def _extract_labels_from_series(series):
# remove trailing NaN or None values
# (multiple Axis or Group objects of different lengths
# are stored in the same DataFrame leading to trailing
# NaNs or None values when split into series)
series = series.loc[:series.last_valid_index()]

labels = np.asarray(series.values)
# integer labels of axes or groups may have been converted to float values
# because of trailing NaNs
if labels.dtype.kind == 'f' and all([label.is_integer() for label in labels]):
labels = labels.astype(int)
# if dtype is still object, we assume values are strings
if labels.dtype.kind == 'O':
labels = labels.astype(str)
return labels

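A small illustration of what _extract_labels_from_series recovers (illustrative only, not part of the diff; the padding mirrors how shorter axes get trailing NaNs when several axes share one DataFrame):

import numpy as np
import pandas as pd

# a 3-label integer axis padded with NaN to the length of a longer axis;
# the padding upcasts the integer labels to float
padded = pd.Series([0.0, 1.0, 2.0, np.nan, np.nan], name='age')
trimmed = padded.loc[:padded.last_valid_index()]
print(trimmed.values.astype(int))   # [0 1 2]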

def _axis_to_series(key, axis, dtype=None):
name = '{}:{}'.format(key, str(axis))
labels = len(axis) if axis.iswildcard else axis.labels
return pd.Series(data=labels, name=name, dtype=dtype)


def _series_to_axis(series):
return Axis(labels=series.values, name=series.name)
name = str(series.name)
labels = _extract_labels_from_series(series)
if ':' in name:
key, axis_name = name.split(':')
if axis_name[-1] == '*':
labels = labels[0]
if _anonymous_axis_pattern.match(axis_name):
axis_name = None
else:
# for backward compatibility
key = axis_name = name
return key, Axis(labels=labels, name=axis_name)


def _group_to_series(group, dtype=None):
name = group.name if group.name is not None else '{?}'
def _group_to_series(key, group, dtype=None):
if group.axis.name is None:
raise ValueError("Cannot save a group with an anonymous associated axis")
name += '@{}'.format(group.axis.name)
name = '{}:{}@{}'.format(key, group.name, group.axis.name)
return pd.Series(data=group.eval(), name=name, dtype=dtype)


def _series_to_group(series, axis):
name = series.name.split('@')[0]
return LGroup(key=series.values, name=name, axis=axis)
def _series_to_group(series, axes):
key, name = str(series.name).split(':')
group_name, axis_name = name.split('@')
if group_name == 'None':
group_name = None
axis = axes[axis_name]
group_key = _extract_labels_from_series(series)
return key, LGroup(key=group_key, name=group_name, axis=axis)


# ######################################## #
# DATAFRAME <--> AXES, GROUPS, META #
# ######################################## #

def _df_to_axes(df):
return OrderedDict([(col_name, _series_to_axis(df[col_name])) for col_name in df.columns.values])
return OrderedDict([_series_to_axis(df[col_name]) for col_name in df.columns.values])


def _axes_to_df(axes):
# set dtype to np.object otherwise pd.concat below may convert an int row/column as float
# if trailing NaN need to be added
return pd.concat([_axis_to_series(axis, dtype=np.object) for axis in axes], axis=1)
return pd.concat([_axis_to_series(key, axis, dtype=np.object) for key, axis in axes.items()], axis=1)


def _df_to_groups(df, axes):
groups = OrderedDict()
for name, values in df.iteritems():
group_name, axis_name = name.split('@')
axis = axes[axis_name]
groups[group_name] = _series_to_group(values, axis)
return groups
return OrderedDict([_series_to_group(df[col_name], axes) for col_name in df.columns.values])


def _groups_to_df(groups):
# set dtype to np.object otherwise pd.concat below may convert an int row/column as float
# if trailing NaN need to be added
return pd.concat([_group_to_series(group, dtype=np.object) for group in groups], axis=1)
return pd.concat([_group_to_series(key, group, dtype=np.object) for key, group in groups.items()], axis=1)
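
Taken together, the functions above imply a column-name scheme for axes and groups dumped to a flat DataFrame: '<session key>:<axis display name>' for axes (with '*' marking wildcard axes and '{?}' anonymous ones) and '<session key>:<group name>@<axis name>' for groups. A hedged sketch, with the str(axis) output for anonymous/wildcard axes inferred from the code rather than taken from the documentation:

from larray import Axis

age = Axis('age=0..9')              # named axis
anon = Axis(3)                      # anonymous wildcard axis
teens = age[1:5] >> 'teens'         # named group defined on the age axis

# column names as built by _axis_to_series / _group_to_series above
print('{}:{}'.format('age', str(age)))                           # 'age:age'
print('{}:{}'.format('anon', str(anon)))                         # expected: 'anon:{?}*'
print('{}:{}@{}'.format('teens', teens.name, teens.axis.name))   # 'teens:teens@age'
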
23 changes: 23 additions & 0 deletions larray/tests/test_array.py
@@ -3454,6 +3454,17 @@ def test_from_series():
assert_array_equal(res, expected)


def test_to_frame():
# array containing anonymous axes
arr = ndtest((Axis(2), Axis(2), Axis(2)))
df = arr.to_frame()
assert df.index.name is None
assert df.index.names == ['{0}*', '{1}*']
Contributor

is it really what we want???

assert df.columns.name == '{2}*'
assert list(df.index.values) == [(0, 0), (0, 1), (1, 0), (1, 1)]
assert list(df.columns.values) == [0, 1]


def test_from_frame():
# 1) data = scalar
# ================
Expand Down Expand Up @@ -3816,6 +3827,18 @@ def test_from_frame():
assert la.axes.names == ['age', 'sex', 'time']
assert_array_equal(la[0, 'F', :], [3722, 3395, 3347])

# 3C) 3 anonymous axes
# ====================
arr = ndtest((Axis(2), Axis(2), Axis(2)))
df = arr.to_frame()

la = from_frame(df)
assert la.ndim == 3
assert la.shape == (2, 2, 2)
for axis in la.axes:
assert axis.name is None
assert axis.iswildcard

# 4) test sort_rows and sort_columns arguments
# ============================================
age = Axis('age=2,0,1,3')