Skip to content

Commit 5a365d0

Browse files
committed
fix #830 and #831: fixed loading files with several index columns (axes) with the same name (including an empty name)
also improved the roundtrip between larray and pandas objects with no name (None as the name). To make the transition as faithful as possible, '' should only be replaced by None, and other non-string names converted to strings, in the methods used when loading *text*-based formats (df_asarray), not in all dataframe imports (from_frame). Note that the symmetric change (replacing None by '' when writing text-based formats) is AFAICT not necessary because it happens automatically in both xlwings and DataFrame.to_csv
1 parent e3fd35f commit 5a365d0

File tree

5 files changed

+120
-23
lines changed

5 files changed

+120
-23
lines changed

doc/source/changes/version_0_32_1.rst.inc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.. py:currentmodule:: larray
1+
.. py:currentmodule:: larray
22

33

44
Miscellaneous improvements
@@ -10,4 +10,5 @@ Miscellaneous improvements
1010
Fixes
1111
^^^^^
1212

13-
* fixed something (closes :issue:`1`).
13+
* fixed loading arrays with more than 2 dimensions but no axes names (even when specifying nb_axes explicitly). This
14+
case mostly occurs when trying to load a specific range of an Excel file (closes :issue:`830` and :issue:`831`).

larray/core/array.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2544,14 +2544,22 @@ def dump(self, header=True, wide=True, value_name='value', light=False, axes_nam
25442544
if _axes_display_names:
25452545
axes_names = self.axes.display_names[:]
25462546
else:
2547-
axes_names = [axis_name if axis_name is not None else '' for axis_name in self.axes.names]
2547+
axes_names = self.axes.names
25482548

25492549
# transforms ['a', 'b', 'c', 'd'] into ['a', 'b', 'c\\d']
25502550
if wide and len(axes_names) > 1:
25512551
if dump_axes_names is True:
2552-
separator = '\\' if axes_names[-1] else ''
2553-
axes_names[-2] = separator.join(axes_names[-2:])
2554-
axes_names.pop()
2552+
# combine two last names
2553+
last_name = axes_names.pop()
2554+
prev_name = axes_names[-1]
2555+
# do not combine if last_name is None or ''
2556+
if last_name:
2557+
prev_name = prev_name if prev_name is not None else ''
2558+
combined_name = prev_name + '\\' + last_name
2559+
else:
2560+
# whether it is a string or None !
2561+
combined_name = prev_name
2562+
axes_names[-1] = combined_name
25552563
elif dump_axes_names == 'except_last':
25562564
axes_names = axes_names[:-1]
25572565
else:
@@ -8639,6 +8647,18 @@ def index_if_exists(a, axis, i):
86398647
return a[a_axis[axis.labels[i]]]
86408648
else:
86418649
return a
8650+
# CHECK: try something like:
8651+
# def index_if_exists(a, igroup):
8652+
# axis = igroup.axis
8653+
# if isinstance(a, Array) and axis in a.axes:
8654+
# a_axis = a.axes[axis]
8655+
# return a[a_axis[axis.labels[i]]]
8656+
# else:
8657+
# return a
8658+
# for i in axis.i[1:]:
8659+
# i_mult = index_if_exists(mult, i)
8660+
# i_inc = index_if_exists(inc, i)
8661+
# res[i] = res[i - 1] * i_mult + i_inc
86428662
for i in range(1, len(axis)):
86438663
i_mult = index_if_exists(mult, axis, i)
86448664
i_inc = index_if_exists(inc, axis, i)

larray/inout/misc.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from larray.util.misc import deprecate_kwarg
77
from larray.util.compat import StringIO
88
from larray.inout.common import _get_index_col
9-
from larray.inout.pandas import df_asarray
9+
from larray.inout.pandas import df_asarray, set_dataframe_index_by_position
1010
from larray.inout.csv import read_csv
1111

1212

@@ -112,7 +112,7 @@ def from_lists(data, nb_axes=None, index_col=None, fill_value=nan, sort_rows=Fal
112112

113113
df = DataFrame(data[1:], columns=data[0])
114114
if index_col is not None:
115-
df.set_index([df.columns[c] for c in index_col], inplace=True)
115+
df = set_dataframe_index_by_position(df, index_col)
116116

117117
return df_asarray(df, raw=index_col is None, parse_header=False, sort_rows=sort_rows, sort_columns=sort_columns,
118118
fill_value=fill_value, wide=wide)

larray/inout/pandas.py

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def index_to_labels(idx, sort=True):
4343
if sort:
4444
return list(idx.levels)
4545
else:
46-
return [list(unique(idx.get_level_values(l))) for l in idx.names]
46+
return [list(unique(idx.get_level_values(l))) for l in range(idx.nlevels)]
4747
else:
4848
assert isinstance(idx, pd.core.index.Index)
4949
labels = list(idx.values)
@@ -225,7 +225,6 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
225225
if unfold_last_axis_name:
226226
if isinstance(axes_names[-1], basestring) and '\\' in axes_names[-1]:
227227
last_axes = [name.strip() for name in axes_names[-1].split('\\')]
228-
last_axes = [name if name else None for name in last_axes]
229228
axes_names = axes_names[:-1] + last_axes
230229
else:
231230
axes_names += [None]
@@ -244,14 +243,32 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
244243
# Pandas treats column labels as column names (strings) so we need to convert them to values
245244
last_axis_labels = [parse(cell) for cell in df.columns.values] if parse_header else list(df.columns.values)
246245
axes_labels.append(last_axis_labels)
247-
axes_names = [str(name) if name is not None else name
248-
for name in axes_names]
249246

250247
axes = AxisCollection([Axis(labels, name) for labels, name in zip(axes_labels, axes_names)])
251248
data = df.values.reshape(axes.shape)
252249
return Array(data, axes, meta=meta)
253250

254251

252+
def set_dataframe_index_by_position(df, index_col_indices):
    """
    Equivalent to DataFrame.set_index but using column positions instead of column labels.

    This is necessary to support creating an index from columns without a name or with duplicate names.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to take the index columns from. It is not modified.
    index_col_indices : int or iterable of int
        Position(s) of the column(s) to use as index. Negative positions count from the last column.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the selected columns as its index and the other columns (in their original order)
        as its data columns.
    """
    try:
        index_col_indices = list(index_col_indices)
    except TypeError:
        # a single (non iterable) position was given
        index_col_indices = [index_col_indices]
    num_columns = len(df.columns)
    # normalize negative positions, otherwise the corresponding columns would not be excluded from the data
    # columns below and would end up both in the index and in the data
    index_col_indices = [i + num_columns if i < 0 else i for i in index_col_indices]
    index_col_indices_set = set(index_col_indices)
    # extract the index columns by position *before* dropping them, so that columns with duplicate or missing
    # names are supported
    index_col_values = [df.iloc[:, i] for i in index_col_indices]
    non_index_col_indices = [i for i in range(num_columns) if i not in index_col_indices_set]
    # drop the index columns from the "normal" columns of the dataframe and add them back as index columns
    return df.iloc[:, non_index_col_indices].set_index(index_col_values)
270+
271+
255272
def df_asarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, wide=True, cartesian_prod=True,
256273
**kwargs):
257274
r"""
@@ -307,12 +324,10 @@ def df_asarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=
307324

308325
# This is required to handle int column names (otherwise we can simply use column positions in set_index).
309326
# This is NOT the same as df.columns[list(range(...))] !
310-
index_columns = [df.columns[i] for i in range(pos_last + 1)]
311-
df.set_index(index_columns, inplace=True)
327+
df = set_dataframe_index_by_position(df, list(range(pos_last + 1)))
312328
else:
313-
index_columns = [df.columns[i] for i in range(len(df.columns) - 1)]
314-
df.set_index(index_columns, inplace=True)
315-
series = df[df.columns[-1]]
329+
df = set_dataframe_index_by_position(df, list(range(len(df.columns) - 1)))
330+
series = df.iloc[:, -1]
316331
series.name = df.index.name
317332
return from_series(series, sort_rows=sort_columns, **kwargs)
318333

@@ -339,6 +354,10 @@ def parse_axis_name(name):
339354
unfold_last_axis_name=unfold_last_axis_name, cartesian_prod=cartesian_prod, **kwargs)
340355

341356
# ugly hack to avoid anonymous axes converted as axes with name 'Unnamed: x' by pandas
357+
# we also take the opportunity to change axes with empty name to real anonymous axes (name is None) to
358+
# make them roundtrip correctly, based on the assumption that in an in-memory LArray an anonymouse axis is more
359+
# likely and useful than an Axis with an empty name.
342360
# TODO : find a more robust and elegant solution
343-
res = res.rename({axis: None for axis in res.axes if isinstance(axis.name, basestring) and 'Unnamed' in axis.name})
361+
res = res.rename({axis: None for axis in res.axes if isinstance(axis.name, basestring) and
362+
(axis.name == '' or 'Unnamed:' in axis.name)})
344363
return res

larray/tests/test_array.py

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3423,6 +3423,46 @@ def test_read_excel_pandas():
34233423

34243424

34253425
def test_from_lists():
3426+
simple_arr = ndtest((2, 2, 3))
3427+
3428+
# simple
3429+
arr_list = [['a', 'b\\c', 'c0', 'c1', 'c2'],
3430+
['a0', 'b0', 0, 1, 2],
3431+
['a0', 'b1', 3, 4, 5],
3432+
['a1', 'b0', 6, 7, 8],
3433+
['a1', 'b1', 9, 10, 11]]
3434+
res = from_lists(arr_list)
3435+
assert_array_equal(res, simple_arr)
3436+
3437+
# simple (using dump). This should be the same test as above.
3438+
# We just make sure dump() and from_lists() round-trip correctly.
3439+
arr_list = simple_arr.dump()
3440+
res = from_lists(arr_list)
3441+
assert_array_equal(res, simple_arr)
3442+
3443+
# with anonymous axes
3444+
arr_anon = simple_arr.rename({0: None, 1: None, 2: None})
3445+
arr_list = arr_anon.dump()
3446+
assert arr_list == [[None, None, 'c0', 'c1', 'c2'],
3447+
['a0', 'b0', 0, 1, 2],
3448+
['a0', 'b1', 3, 4, 5],
3449+
['a1', 'b0', 6, 7, 8],
3450+
['a1', 'b1', 9, 10, 11]]
3451+
res = from_lists(arr_list, nb_axes=3)
3452+
assert_array_equal(res, arr_anon)
3453+
3454+
# with empty ('') axes names
3455+
arr_empty_names = simple_arr.rename({0: '', 1: '', 2: ''})
3456+
arr_list = arr_empty_names.dump()
3457+
assert arr_list == [[ '', '', 'c0', 'c1', 'c2'],
3458+
['a0', 'b0', 0, 1, 2],
3459+
['a0', 'b1', 3, 4, 5],
3460+
['a1', 'b0', 6, 7, 8],
3461+
['a1', 'b1', 9, 10, 11]]
3462+
res = from_lists(arr_list, nb_axes=3)
3463+
# this is purposefully NOT arr_empty_names because from_lists (via df_asarray) transforms '' axes to None
3464+
assert_array_equal(res, arr_anon)
3465+
34263466
# sort_rows
34273467
arr = from_lists([['sex', 'nat\\year', 1991, 1992, 1993],
34283468
['F', 'BE', 0, 0, 1],
@@ -3549,12 +3589,11 @@ def test_from_frame():
35493589
# i0 10
35503590
df = pd.DataFrame([10], index=pd.Index(['i0'], name=0), columns=['c0'])
35513591
res = from_frame(df)
3592+
expected = Array([[10]], [Axis(['i0'], name=0), Axis(['c0'])])
35523593
assert res.ndim == 2
35533594
assert res.shape == (1, 1)
3554-
assert res.axes.names == ['0', None]
3555-
assert list(res.axes[0].labels) == ['i0']
3556-
assert list(res.axes[1].labels) == ['c0']
3557-
assert_array_equal(res, Array([[10]], "0=i0;c0,"))
3595+
assert res.axes.names == [0, None]
3596+
assert_array_equal(res, expected)
35583597

35593598
# anonymous index
35603599
# input dataframe:
@@ -3846,6 +3885,21 @@ def test_from_frame():
38463885
assert la.axes.names == ['age', 'sex', 'time']
38473886
assert_array_equal(la[0, 'F', :], [3722, 3395, 3347])
38483887

3888+
# 3C) DataFrame with no axis names (names are None)
3889+
# ===============================
3890+
arr_no_names = ndtest("a0,a1;b0..b2;c0..c3")
3891+
df_no_names = arr_no_names.df
3892+
res = from_frame(df_no_names)
3893+
assert_array_equal(res, arr_no_names)
3894+
3895+
# 3D) Dataframe with empty axe names (names are '')
3896+
# ==================================
3897+
arr_empty_names = ndtest("=a0,a1;=b0..b2;=c0..c3")
3898+
assert arr_empty_names.axes.names == ['', '', '']
3899+
df_no_names = arr_empty_names.df
3900+
res = from_frame(df_no_names)
3901+
assert_array_equal(res, arr_empty_names)
3902+
38493903
# 4) test sort_rows and sort_columns arguments
38503904
# ============================================
38513905
age = Axis('age=2,0,1,3')
@@ -4350,7 +4404,10 @@ def test_open_excel(tmpdir):
43504404
with open_excel(fpath, overwrite_file=True) as wb:
43514405
wb[0] = arr.dump()
43524406
res = wb[0].load()
4353-
assert arr.equals(res)
4407+
# the result should be identical to the original array except we lost the information about
4408+
# the wildcard axis being a wildcard axis
4409+
expected = arr.set_axes('b', Axis([0, 1], 'b'))
4410+
assert_array_equal(res, expected)
43544411

43554412
# 6) crash test
43564413
# =============

0 commit comments

Comments
 (0)