rprof is only accessible through step object

amorison · amorison · commit f886be05a5e4 · 2020-09-14T12:44:56.000+01:00
The huge pandas DataFrame grouping all rprof no longer exists; radial
profiles are now stored as one DataFrame per time step.  This paves the
way for step-by-step reading of the HDF5 file, and allows us to read
rprof data even if the number of profiles increases with time.
diff --git a/stagpy/_step.py b/stagpy/_step.py
@@ -442,11 +442,11 @@ def timeinfo(self):
     def rprof(self):
         """Radial profiles data of the time step.
 
-        Set to None if no radial profiles data is available for this time step.
+        This is a :class:`pandas.DataFrame` with iz as index and variable names
+        as columns.  Set to None if no radial profiles data is available for
+        this time step.
         """
-        if self.istep not in self.sdat.rprof.index.levels[0]:
-            return None
-        return self.sdat.rprof.loc[self.istep]
+        return self.sdat._rprof_and_times[0].get(self.istep)
 
     @property
     def isnap(self):
diff --git a/stagpy/plates.py b/stagpy/plates.py
@@ -593,8 +593,7 @@ def main_plates(sdat):
     """Plot several plates information."""
     # calculating averaged horizontal surface velocity
     # needed for redimensionalisation
-    ilast = sdat.rprof.index.levels[0][-1]
-    rlast = sdat.rprof.loc[ilast]
+    rlast = sdat.snaps[-1].rprof
     nprof = 0
     uprof_averaged = rlast.loc[:, 'vhrms'] * 0
     for step in sdat.walk.filter(rprof=True):
diff --git a/stagpy/rprof.py b/stagpy/rprof.py
@@ -205,8 +205,6 @@ def cmd():
         conf.core
     """
     sdat = StagyyData()
-    if sdat.rprof is None:
-        return
 
     if conf.rprof.grid:
         for step in sdat.walk.filter(rprof=True):
diff --git a/stagpy/stagyydata.py b/stagpy/stagyydata.py
@@ -601,7 +601,7 @@ def _rprof_and_times(self):
             rproffile = self.filename('rprof.h5')
             self._stagdat['rprof'] = stagyyparsers.rprof_h5(
                 rproffile, list(phyvars.RPROF.keys()))
-            if self._stagdat['rprof'][0] is not None:
+            if self._stagdat['rprof'][1] is not None:
                 return self._stagdat['rprof']
             rproffile = self.filename('rprof.dat')
             if self.hdf5 and not rproffile.is_file():
@@ -611,15 +611,6 @@ def _rprof_and_times(self):
                 rproffile, list(phyvars.RPROF.keys()))
         return self._stagdat['rprof']
 
-    @property
-    def rprof(self):
-        """Radial profiles data.
-
-        This is a :class:`pandas.DataFrame` with a 2-level index (istep and iz)
-        and variable names as columns.
-        """
-        return self._rprof_and_times[0]
-
     @property
     def rtimes(self):
         """Radial profiles times.
diff --git a/stagpy/stagyyparsers.py b/stagpy/stagyyparsers.py
@@ -6,7 +6,7 @@
     of :class:`~stagpy.stagyydata.StagyyData`.
 """
 from functools import partial
-from itertools import product, repeat
+from itertools import product
 from operator import itemgetter
 from xml.etree import ElementTree as xmlET
 import re
@@ -112,11 +112,10 @@ def time_series_h5(timefile, colnames):
     return pdf.loc[~pdf.index.duplicated(keep='last')]
 
 
-def _extract_rsnap_isteps(rproffile):
-    """Extract istep and compute list of rows to delete."""
+def _extract_rsnap_isteps(rproffile, data):
+    """Extract istep, time and build separate rprof df."""
     step_regex = re.compile(r'^\*+step:\s*(\d+) ; time =\s*(\S+)')
-    isteps = []  # list of (istep, time, nz)
-    rows_to_del = set()
+    isteps = []  # list of (istep, time, df)
     line = ' '
     with rproffile.open() as stream:
         while line[0] != '*':
@@ -128,22 +127,19 @@ def _extract_rsnap_isteps(rproffile):
         iline = 0
         for line in stream:
             if line[0] == '*':
-                isteps.append((istep, time, nlines))
+                isteps.append((istep, time, data.iloc[iline - nlines:iline]))
                 match = step_regex.match(line)
                 istep = int(match.group(1))
                 time = float(match.group(2))
                 nlines = 0
                 # remove useless lines produced when run is restarted
-                nrows_to_del = 0
                 while isteps and istep <= isteps[-1][0]:
-                    nrows_to_del += isteps.pop()[-1]
-                rows_to_del = rows_to_del.union(
-                    range(iline - nrows_to_del, iline))
+                    isteps.pop()
             else:
                 nlines += 1
                 iline += 1
-        isteps.append((istep, time, nlines))
-    return isteps, rows_to_del
+        isteps.append((istep, time, data.iloc[iline - nlines:iline]))
+    return isteps
 
 
 def rprof(rproffile, colnames):
@@ -156,39 +152,32 @@ def rprof(rproffile, colnames):
     Args:
         rproffile (:class:`pathlib.Path`): path of the rprof.dat file.
         colnames (list of names): names of the variables expected in
-            :data:`rproffile` (may be modified).
+            :data:`rproffile`.
 
     Returns:
-        tuple of :class:`pandas.DataFrame`: (profs, times)
-            :data:`profs` are the radial profiles, with the variables in
-            columns and rows double-indexed with the time step and the radial
-            index of numerical cells.
+        tuple: (profs, times)
+            :data:`profs` is a dict mapping istep to radial profiles
+            :class:`pandas.DataFrame`.
 
             :data:`times` is the dimensionless time indexed by time steps.
     """
     if not rproffile.is_file():
-        return None, None
+        return {}, None
     data = pd.read_csv(rproffile, delim_whitespace=True, dtype=str,
                        header=None, comment='*', skiprows=1,
                        engine='c', memory_map=True,
                        error_bad_lines=False, warn_bad_lines=False)
     data = data.apply(pd.to_numeric, raw=True, errors='coerce')
 
-    isteps, rows_to_del = _extract_rsnap_isteps(rproffile)
-    if rows_to_del:
-        rows_to_keep = set(range(len(data))) - rows_to_del
-        data = data.take(list(rows_to_keep))
+    isteps = _extract_rsnap_isteps(rproffile, data)
 
-    id_arr = [[], []]
-    for istep, _, n_z in isteps:
-        id_arr[0].extend(repeat(istep, n_z))
-        id_arr[1].extend(range(n_z))
-
-    data.index = id_arr
-
-    ncols = data.shape[1]
-    _tidy_names(colnames, ncols)
-    data.columns = colnames
+    data = {}
+    for istep, _, step_df in isteps:
+        step_df.index = range(step_df.shape[0])  # check whether necessary
+        step_cols = list(colnames)
+        _tidy_names(step_cols, step_df.shape[1])
+        step_df.columns = step_cols
+        data[istep] = step_df
 
     df_times = pd.DataFrame(list(map(itemgetter(1), isteps)),
                             index=map(itemgetter(0), isteps))
@@ -207,38 +196,32 @@ def rprof_h5(rproffile, colnames):
             :data:`rproffile`.
 
     Returns:
-        tuple of :class:`pandas.DataFrame`: (profs, times)
-            :data:`profs` are the radial profiles, with the variables in
-            columns and rows double-indexed with the time step and the radial
-            index of numerical cells.
+        tuple: (profs, times)
+            :data:`profs` is a dict mapping istep to radial profiles
+            :class:`pandas.DataFrame`.
 
             :data:`times` is the dimensionless time indexed by time steps.
     """
     if not rproffile.is_file():
-        return None, None
+        return {}, None
     isteps = []
+    data = {}
     with h5py.File(rproffile, 'r') as h5f:
         dnames = sorted(dname for dname in h5f.keys()
                         if dname.startswith('rprof_'))
-        ncols = h5f['names'].shape[0]
         h5names = map(bytes.decode, h5f['names'][len(colnames):])
-        _tidy_names(colnames, ncols, h5names)
-        data = np.zeros((0, ncols))
         for dname in dnames:
             dset = h5f[dname]
-            data = np.concatenate((data, dset[()]))
-            isteps.append((dset.attrs['istep'], dset.attrs['time'],
-                           dset.shape[0]))
-
-    id_arr = [[], []]
-    for istep, _, n_z in isteps:
-        id_arr[0].extend(repeat(istep, n_z))
-        id_arr[1].extend(range(n_z))
+            arr = dset[()]
+            istep = dset.attrs['istep']
+            step_cols = list(colnames)
+            _tidy_names(step_cols, arr.shape[1], h5names)  # check shape
+            data[istep] = pd.DataFrame(arr, columns=step_cols)
+            isteps.append((istep, dset.attrs['time']))
 
-    df_data = pd.DataFrame(data, index=id_arr, columns=colnames)
     df_times = pd.DataFrame(list(map(itemgetter(1), isteps)),
                             index=map(itemgetter(0), isteps))
-    return df_data, df_times
+    return data, df_times
 
 
 def _clean_names_refstate(names):
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
@@ -16,12 +16,13 @@ def test_time_series_invalid_prs():
 def test_rprof_prs(sdat):
     names = ['aa', 'bb', 'cc']
     data, time = prs.rprof(sdat.filename('rprof.dat'), list(names))
-    assert (data.columns[:3] == names).all()
-    assert (data.columns[3:] == list(range(data.shape[1] - 3))).all()
+    assert all((df.columns[:3] == names).all() for df in data.values())
+    assert all((df.columns[3:] == list(range(df.shape[1] - 3))).all()
+               for df in data.values())
 
 
 def test_rprof_invalid_prs():
-    assert prs.rprof(pathlib.Path('dummy'), []) == (None, None)
+    assert prs.rprof(pathlib.Path('dummy'), []) == ({}, None)
 
 
 def test_fields_prs(sdat):
diff --git a/tests/test_stagyydata.py b/tests/test_stagyydata.py
@@ -38,8 +38,8 @@ def test_sdat_tseries(sdat):
     assert isinstance(sdat.tseries, pandas.DataFrame)
 
 
-def test_sdat_rprof(sdat):
-    assert isinstance(sdat.rprof, pandas.DataFrame)
+def test_sdat_rtimes(sdat):
+    assert isinstance(sdat.rtimes, pandas.DataFrame)
 
 
 def test_sdat_walk_dflt(sdat):