fix #830 and #831: fixed loading files with several index columns (axes) with the same name (including an empty name)

gdementen · gdementen · commit 5a365d087808 · 2019-12-16T08:59:46.000+01:00
Also improved the round trip between larray and pandas objects with no name (None as the name).
To make the conversion as faithful as possible, '' should only be replaced by None, and other non-string names converted to strings, in the methods used when loading *text*-based formats (df_asarray),
not in all DataFrame imports (from_frame).

Note that the symmetric change (replacing None by '' when writing text-based formats) is AFAICT not necessary because it happens automatically in both xlwings and DataFrame.to_csv.
diff --git a/doc/source/changes/version_0_32_1.rst.inc b/doc/source/changes/version_0_32_1.rst.inc
@@ -1,4 +1,4 @@
-.. py:currentmodule:: larray
+﻿.. py:currentmodule:: larray
 
 
 Miscellaneous improvements
@@ -10,4 +10,5 @@ Miscellaneous improvements
 Fixes
 ^^^^^
 
-* fixed something (closes :issue:`1`).
+* fixed loading arrays with more than 2 dimensions but no axes names (even when specifying nb_axes explicitly). This
+  case mostly occurs when trying to load a specific range of an Excel file (closes :issue:`830` and :issue:`831`).
diff --git a/larray/core/array.py b/larray/core/array.py
@@ -2544,14 +2544,22 @@ def dump(self, header=True, wide=True, value_name='value', light=False, axes_nam
             if _axes_display_names:
                 axes_names = self.axes.display_names[:]
             else:
-                axes_names = [axis_name if axis_name is not None else '' for axis_name in self.axes.names]
+                axes_names = self.axes.names
 
             # transforms ['a', 'b', 'c', 'd'] into ['a', 'b', 'c\\d']
             if wide and len(axes_names) > 1:
                 if dump_axes_names is True:
-                    separator = '\\' if axes_names[-1] else ''
-                    axes_names[-2] = separator.join(axes_names[-2:])
-                    axes_names.pop()
+                    # combine two last names
+                    last_name = axes_names.pop()
+                    prev_name = axes_names[-1]
+                    # do not combine if last_name is None or ''
+                    if last_name:
+                        prev_name = prev_name if prev_name is not None else ''
+                        combined_name = prev_name + '\\' + last_name
+                    else:
+                        # keep prev_name as is, whether it is a string or None
+                        combined_name = prev_name
+                    axes_names[-1] = combined_name
                 elif dump_axes_names == 'except_last':
                     axes_names = axes_names[:-1]
                 else:
@@ -8639,6 +8647,18 @@ def index_if_exists(a, axis, i):
                 return a[a_axis[axis.labels[i]]]
             else:
                 return a
+        # CHECK: try something like:
+        # def index_if_exists(a, igroup):
+        #     axis = igroup.axis
+        #     if isinstance(a, Array) and axis in a.axes:
+        #         a_axis = a.axes[axis]
+        #         return a[a_axis[axis.labels[i]]]
+        #     else:
+        #         return a
+        # for i in axis.i[1:]:
+        #     i_mult = index_if_exists(mult, i)
+        #     i_inc = index_if_exists(inc, i)
+        #     res[i] = res[i - 1] * i_mult + i_inc
         for i in range(1, len(axis)):
             i_mult = index_if_exists(mult, axis, i)
             i_inc = index_if_exists(inc, axis, i)
diff --git a/larray/inout/misc.py b/larray/inout/misc.py
@@ -6,7 +6,7 @@
 from larray.util.misc import deprecate_kwarg
 from larray.util.compat import StringIO
 from larray.inout.common import _get_index_col
-from larray.inout.pandas import df_asarray
+from larray.inout.pandas import df_asarray, set_dataframe_index_by_position
 from larray.inout.csv import read_csv
 
 
@@ -112,7 +112,7 @@ def from_lists(data, nb_axes=None, index_col=None, fill_value=nan, sort_rows=Fal
 
     df = DataFrame(data[1:], columns=data[0])
     if index_col is not None:
-        df.set_index([df.columns[c] for c in index_col], inplace=True)
+        df = set_dataframe_index_by_position(df, index_col)
 
     return df_asarray(df, raw=index_col is None, parse_header=False, sort_rows=sort_rows, sort_columns=sort_columns,
                       fill_value=fill_value, wide=wide)
diff --git a/larray/inout/pandas.py b/larray/inout/pandas.py
@@ -43,7 +43,7 @@ def index_to_labels(idx, sort=True):
         if sort:
             return list(idx.levels)
         else:
-            return [list(unique(idx.get_level_values(l))) for l in idx.names]
+            return [list(unique(idx.get_level_values(l))) for l in range(idx.nlevels)]
     else:
         assert isinstance(idx, pd.core.index.Index)
         labels = list(idx.values)
@@ -225,7 +225,6 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
     if unfold_last_axis_name:
         if isinstance(axes_names[-1], basestring) and '\\' in axes_names[-1]:
             last_axes = [name.strip() for name in axes_names[-1].split('\\')]
-            last_axes = [name if name else None for name in last_axes]
             axes_names = axes_names[:-1] + last_axes
         else:
             axes_names += [None]
@@ -244,14 +243,32 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
     # Pandas treats column labels as column names (strings) so we need to convert them to values
     last_axis_labels = [parse(cell) for cell in df.columns.values] if parse_header else list(df.columns.values)
     axes_labels.append(last_axis_labels)
-    axes_names = [str(name) if name is not None else name
-                  for name in axes_names]
 
     axes = AxisCollection([Axis(labels, name) for labels, name in zip(axes_labels, axes_names)])
     data = df.values.reshape(axes.shape)
     return Array(data, axes, meta=meta)
 
 
+def set_dataframe_index_by_position(df, index_col_indices):
+    """
+    Equivalent to DataFrame.set_index but using column positions instead of column labels.
+
+    This is necessary to support creating an index from columns without a name or with duplicate names.
+
+    Returns a new DataFrame.
+    """
+    if not isinstance(index_col_indices, list):
+        index_col_indices = [index_col_indices]
+    index_col_indices_set = set(index_col_indices)
+    index_col_values = [df.iloc[:, i] for i in index_col_indices]
+    non_index_col_indices = [i for i in range(len(df.columns)) if i not in index_col_indices_set]
+    # drop the index columns from the "normal" columns of the dataframe
+    df = df.iloc[:, non_index_col_indices]
+    # add them back as index columns
+    df.set_index(index_col_values, inplace=True)
+    return df
+
+
 def df_asarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, wide=True, cartesian_prod=True,
                **kwargs):
     r"""
@@ -307,12 +324,10 @@ def df_asarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=
 
             # This is required to handle int column names (otherwise we can simply use column positions in set_index).
             # This is NOT the same as df.columns[list(range(...))] !
-            index_columns = [df.columns[i] for i in range(pos_last + 1)]
-            df.set_index(index_columns, inplace=True)
+            df = set_dataframe_index_by_position(df, list(range(pos_last + 1)))
         else:
-            index_columns = [df.columns[i] for i in range(len(df.columns) - 1)]
-            df.set_index(index_columns, inplace=True)
-            series = df[df.columns[-1]]
+            df = set_dataframe_index_by_position(df, list(range(len(df.columns) - 1)))
+            series = df.iloc[:, -1]
             series.name = df.index.name
             return from_series(series, sort_rows=sort_columns, **kwargs)
 
@@ -339,6 +354,10 @@ def parse_axis_name(name):
                          unfold_last_axis_name=unfold_last_axis_name, cartesian_prod=cartesian_prod, **kwargs)
 
     # ugly hack to avoid anonymous axes converted as axes with name 'Unnamed: x' by pandas
+    # we also take the opportunity to change axes with an empty name to real anonymous axes (name is None) to
+    # make them roundtrip correctly, based on the assumption that in an in-memory LArray an anonymous axis is more
+    # likely and useful than an axis with an empty name.
     # TODO : find a more robust and elegant solution
-    res = res.rename({axis: None for axis in res.axes if isinstance(axis.name, basestring) and 'Unnamed' in axis.name})
+    res = res.rename({axis: None for axis in res.axes if isinstance(axis.name, basestring) and
+                      (axis.name == '' or 'Unnamed:' in axis.name)})
     return res
diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py
@@ -3423,6 +3423,46 @@ def test_read_excel_pandas():
 
 
 def test_from_lists():
+    simple_arr = ndtest((2, 2, 3))
+
+    # simple
+    arr_list = [['a', 'b\\c', 'c0', 'c1', 'c2'],
+                ['a0', 'b0', 0, 1, 2],
+                ['a0', 'b1', 3, 4, 5],
+                ['a1', 'b0', 6, 7, 8],
+                ['a1', 'b1', 9, 10, 11]]
+    res = from_lists(arr_list)
+    assert_array_equal(res, simple_arr)
+
+    # simple (using dump). This should be the same test as above.
+    # We just make sure dump() and from_lists() round-trip correctly.
+    arr_list = simple_arr.dump()
+    res = from_lists(arr_list)
+    assert_array_equal(res, simple_arr)
+
+    # with anonymous axes
+    arr_anon = simple_arr.rename({0: None, 1: None, 2: None})
+    arr_list = arr_anon.dump()
+    assert arr_list == [[None, None, 'c0', 'c1', 'c2'],
+                        ['a0', 'b0',    0,    1,    2],
+                        ['a0', 'b1',    3,    4,    5],
+                        ['a1', 'b0',    6,    7,    8],
+                        ['a1', 'b1',    9,   10,   11]]
+    res = from_lists(arr_list, nb_axes=3)
+    assert_array_equal(res, arr_anon)
+
+    # with empty ('') axes names
+    arr_empty_names = simple_arr.rename({0: '', 1: '', 2: ''})
+    arr_list = arr_empty_names.dump()
+    assert arr_list == [[  '',   '', 'c0', 'c1', 'c2'],
+                        ['a0', 'b0',    0,    1,    2],
+                        ['a0', 'b1',    3,    4,    5],
+                        ['a1', 'b0',    6,    7,    8],
+                        ['a1', 'b1',    9,   10,   11]]
+    res = from_lists(arr_list, nb_axes=3)
+    # this is purposefully NOT arr_empty_names because from_lists (via df_asarray) transforms '' axes to None
+    assert_array_equal(res, arr_anon)
+
     # sort_rows
     arr = from_lists([['sex', 'nat\\year', 1991, 1992, 1993],
                       ['F', 'BE', 0, 0, 1],
@@ -3549,12 +3589,11 @@ def test_from_frame():
     #    i0  10
     df = pd.DataFrame([10], index=pd.Index(['i0'], name=0), columns=['c0'])
     res = from_frame(df)
+    expected = Array([[10]], [Axis(['i0'], name=0), Axis(['c0'])])
     assert res.ndim == 2
     assert res.shape == (1, 1)
-    assert res.axes.names == ['0', None]
-    assert list(res.axes[0].labels) == ['i0']
-    assert list(res.axes[1].labels) == ['c0']
-    assert_array_equal(res, Array([[10]], "0=i0;c0,"))
+    assert res.axes.names == [0, None]
+    assert_array_equal(res, expected)
 
     # anonymous index
     # input dataframe:
@@ -3846,6 +3885,21 @@ def test_from_frame():
     assert la.axes.names == ['age', 'sex', 'time']
     assert_array_equal(la[0, 'F', :], [3722, 3395, 3347])
 
+    # 3C) DataFrame with no axis names (names are None)
+    # ===============================
+    arr_no_names = ndtest("a0,a1;b0..b2;c0..c3")
+    df_no_names = arr_no_names.df
+    res = from_frame(df_no_names)
+    assert_array_equal(res, arr_no_names)
+
+    # 3D) DataFrame with empty axis names (names are '')
+    # ==================================
+    arr_empty_names = ndtest("=a0,a1;=b0..b2;=c0..c3")
+    assert arr_empty_names.axes.names == ['', '', '']
+    df_no_names = arr_empty_names.df
+    res = from_frame(df_no_names)
+    assert_array_equal(res, arr_empty_names)
+
     # 4) test sort_rows and sort_columns arguments
     # ============================================
     age = Axis('age=2,0,1,3')
@@ -4350,7 +4404,10 @@ def test_open_excel(tmpdir):
     with open_excel(fpath, overwrite_file=True) as wb:
         wb[0] = arr.dump()
         res = wb[0].load()
-        assert arr.equals(res)
+        # the result should be identical to the original array except we lost the information about
+        # the wildcard axis being a wildcard axis
+        expected = arr.set_axes('b', Axis([0, 1], 'b'))
+        assert_array_equal(res, expected)
 
     # 6) crash test
     # =============