Merge pull request #747 from davidhassell/um-speed

davidhassell · web-flow · commit e94f82a11135 · 2024-03-26T13:29:01.000Z
Improve the performance of reading and accessing the data of PP and UM fields files
diff --git a/Changelog.rst b/Changelog.rst
@@ -9,6 +9,8 @@ version NEXT
   to regrid the vertical axis in logarithmic coordinates to
   `cf.Field.regrids` and `cf.Field.regridc`
   (https://github.com/NCAS-CMS/cf-python/issues/715)
+* Improve the performance of reading and accessing the data of PP and
+  UM fields files (https://github.com/NCAS-CMS/cf-python/issues/746)
 * Improve `cf.Field.collapse` performance by lazily computing reduced
   axis coordinates (https://github.com/NCAS-CMS/cf-python/issues/741)
 * Improve `cf.Field.__getitem__` performance by not re-calculating
diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py
@@ -8,7 +8,7 @@
     load_stash2standard_name,
     parse_indices,
 )
-from ...umread_lib.umfile import File
+from ...umread_lib.umfile import File, Rec
 from .abstract import Array
 from .mixin import FileArrayMixin
 
@@ -272,13 +272,22 @@ def _get_rec(self, f, header_offset):
                 The record container.
 
         """
-        # TODOCFA: This method doesn't require data_offset and disk_length,
-        # so plays nicely with CFA. Is it fast enough that we can
-        # use this method always?
-        for v in f.vars:
-            for r in v.recs:
-                if r.hdr_offset == header_offset:
-                    return r
+        return Rec.from_file_and_offsets(f, header_offset)
+
+        # ------------------------------------------------------------
+        # Leave the following commented code here for debugging
+        # purposes. Replacing the above line with this code moves the
+        # calculation of the data offset and disk length from pure
+        # Python to the C library, at the expense of completely
+        # parsing the file. Note: If you do replace the above line
+        # with the commented code, then you *must* also set
+        # 'parse=True' in the `open` method.
+        # ------------------------------------------------------------
+
+        # for v in f.vars:
+        #     for r in v.recs:
+        #         if r.hdr_offset == header_offset:
+        #             return r
 
     def _set_units(self, int_hdr):
         """The units and calendar properties.
@@ -666,21 +675,24 @@ def get_word_size(self):
         return self._get_component("word_size", None)
 
     def open(self):
-        """Returns an open dataset containing the data array.
+        """Returns an open dataset and the address of the lookup header.
 
         :Returns:
 
-            `umfile_lib.File`, `int`
+            `umread_lib.umfile.File`, `int`
+                The open file object, and the start address in bytes
+                of the lookup header.
 
         **Examples**
 
         >>> f.open()
-        (<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 44567)
+        (<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 4)
 
         """
         return super().open(
             File,
             byte_ordering=self.get_byte_ordering(),
             word_size=self.get_word_size(),
             fmt=self.get_fmt(),
+            parse=False,
         )
diff --git a/cf/field.py b/cf/field.py
@@ -454,8 +454,11 @@ def __getitem__(self, indices):
         # below.
         if org_cyclic:
             new_cyclic = new_data.cyclic()
-            [new.cyclic(i, iscyclic=False) for i in org_cyclic if i not in new_cyclic]
-                    
+            [
+                new.cyclic(i, iscyclic=False)
+                for i in org_cyclic
+                if i not in new_cyclic
+            ]
 
         # ------------------------------------------------------------
         # Subspace constructs with data
diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py
@@ -2469,9 +2469,9 @@ def data_type_in_file(self, rec):
         if rec.int_hdr.item(lbuser2) == 3:
             # Boolean
             return np.dtype(bool)
-        else:
-            # Int or float
-            return rec.get_type_and_num_words()[0]
+
+        # Int or float
+        return rec.get_type_and_num_words()[0]
 
     def printfdr(self, display=False):
         """Print out the contents of PP field headers.
@@ -3439,7 +3439,7 @@ def read(
         else:
             byte_ordering = None
 
-        f = self.file_open(filename)
+        f = self.file_open(filename, parse=True)
 
         info = is_log_level_info(logger)
 
@@ -3472,6 +3472,7 @@ def _open_um_file(
         fmt=None,
         word_size=None,
         byte_ordering=None,
+        parse=True,
     ):
         """Open a UM fields file or PP file.
 
@@ -3480,10 +3481,18 @@ def _open_um_file(
             filename: `str`
                 The file to be opened.
 
+            parse: `bool`, optional
+                If True, the default, then parse the contents. If
+                False then the contents are not parsed, which can be
+                considerably faster in cases when the contents are not
+                required.
+
+                .. versionadded:: NEXTVERSION
+
         :Returns:
 
-            `umread.umfile.File`
-                The opened file with an open file descriptor.
+            `umread_lib.umfile.File`
+                The open PP or FF file object.
 
         """
         self.file_close()
@@ -3493,6 +3502,7 @@ def _open_um_file(
                 byte_ordering=byte_ordering,
                 word_size=word_size,
                 fmt=fmt,
+                parse=parse,
             )
         except Exception as error:
             try:
@@ -3527,7 +3537,9 @@ def is_um_file(self, filename):
 
         """
         try:
-            self.file_open(filename)
+            # Note: No need to completely parse the file to ascertain
+            #       if it's PP or FF.
+            self.file_open(filename, parse=False)
         except Exception:
             self.file_close()
             return False
@@ -3549,16 +3561,27 @@ def file_close(self):
 
         self._um_file = None
 
-    def file_open(self, filename):
+    def file_open(self, filename, parse=True):
         """Open the file for reading.
 
         :Paramters:
 
             filename: `str`
                 The file to be read.
 
+            parse: `bool`, optional
+                If True, the default, then parse the contents. If
+                False then the contents are not parsed, which can be
+                considerably faster in cases when the contents are not
+                required.
+
+                .. versionadded:: NEXTVERSION
+
         :Returns:
 
+            `umread_lib.umfile.File`
+                The open PP or FF file object.
+
         """
         g = getattr(self, "read_vars", {})
 
@@ -3567,6 +3590,7 @@ def file_open(self, filename):
             byte_ordering=g.get("byte_ordering"),
             word_size=g.get("word_size"),
             fmt=g.get("fmt"),
+            parse=parse,
         )
 
 
diff --git a/cf/test/test_pp.py b/cf/test/test_pp.py
@@ -112,9 +112,8 @@ def test_PP_WGDOS_UNPACKING(self):
 
         f = cf.read(self.ppfile)[0]
 
-        # TODO: reinstate "CFA4" at version>3.14
-        for fmt in ("NETCDF4",):  # "CFA4"):
-            cf.write(f, tmpfile, fmt=fmt)
+        for cfa in (False, True):
+            cf.write(f, tmpfile, cfa=cfa)
             g = cf.read(tmpfile)[0]
 
             self.assertTrue((f.array == array).all())
diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py
@@ -551,7 +551,7 @@ def test_write_netcdf_mode(self):
 
     def test_read_write_netCDF4_compress_shuffle(self):
         f = cf.read(self.filename)[0]
-        # TODO: reinstate "CFA4" at version > 3.14
+        # TODODASK: reinstate "CFA4" at version > 3.14
         for fmt in ("NETCDF4", "NETCDF4_CLASSIC"):  # , "CFA4"):
             cf.write(f, tmpfile, fmt=fmt, compress=1, shuffle=True)
             g = cf.read(tmpfile)[0]
@@ -920,6 +920,9 @@ def test_write_omit_data(self):
         self.assertFalse(g.array.count())
         self.assertTrue(g.construct("grid_latitude").array.count())
 
+    @unittest.skipUnless(
+        False, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION. TODO REPLACE URL"
+    )
     def test_read_url(self):
         """Test reading urls."""
         for scheme in ("http", "https"):
diff --git a/cf/umread_lib/umfile.py b/cf/umread_lib/umfile.py
@@ -11,6 +11,12 @@ class UMFileException(Exception):
     pass
 
 
+# Lookup header pointers
+LBLREC = 14  # Length of data record (including any extra data)
+LBPACK = 20  # Packing method indicator
+LBEGIN = 28  # Disk address/Start Record
+
+
 class File:
     """A class for a UM file that gives a view of the file including
     sets of PP records combined into variables."""
@@ -34,7 +40,7 @@ def __init__(
                 'little_endian' or 'big_endian'
 
             word_size: `int`, optional
-                4 or 8
+                The size in bytes of one word. Either ``4`` or ``8``.
 
             fmt: `str`, optional
                 'FF' or 'PP'
@@ -281,12 +287,14 @@ def __init__(
             self.file = file
 
     @classmethod
-    def from_file_and_offsets(cls, file, hdr_offset, data_offset, disk_length):
+    def from_file_and_offsets(
+        cls, file, hdr_offset, data_offset=None, disk_length=None
+    ):
         """Instantiate a `Rec` object from the `File` object and the
         header and data offsets.
 
-        The headers are read in, and also the record object is ready for
-        calling `get_data`.
+        The lookup header is read from disk immediately, and the
+        returned record object is ready for calling `get_data`.
 
         :Parameters:
 
@@ -295,26 +303,56 @@ def from_file_and_offsets(cls, file, hdr_offset, data_offset, disk_length):
                 into variables.
 
             hdr_offset: `int`
-                The start word in the file of the header.
+                The file start address of the header, in bytes.
 
-            data_offset: `int`
-                The start word in the file of the data.
+            data_offset: `int`, optional
+                The file start address of the data, in bytes. If
+                `None`, the default, then it is calculated from the
+                header offset (PP) or the integer header (FF).
 
             disk_length: `int`
-                The length in words of the data in the file.
+                The length in bytes of the data in the file. If
+                `None`, the default, then the disk length will be
+                calculated from the integer header.
 
         :Returns:
 
              `Rec`
 
         """
         c = file._c_interface
+        word_size = file.word_size
         int_hdr, real_hdr = c.read_header(
-            file.fd, hdr_offset, file.byte_ordering, file.word_size
+            file.fd, hdr_offset, file.byte_ordering, word_size
         )
 
+        if data_offset is None:
+            # Calculate the data offset, which depends on the file format
+            if file.fmt == "PP":
+                # We only support 64-word headers, so the data starts
+                # 66 words after the header_offset, i.e. after 64
+                # words of the header, plus 2 block control words.
+                data_offset = hdr_offset + 66 * word_size
+            else:
+                # Fields file
+                data_offset = int_hdr[LBEGIN] * word_size
+
+        if disk_length is None:
+            # Calculate the disk length from the integer header
+            disk_length = int_hdr[LBLREC]
+            if int_hdr[LBPACK] % 10 == 2:
+                # Cray 32-bit packing
+                disk_length = disk_length * 4
+            else:
+                disk_length = disk_length * word_size
+
         return cls(
-            int_hdr, real_hdr, hdr_offset, data_offset, disk_length, file=file
+            int_hdr,
+            real_hdr,
+            hdr_offset,
+            data_offset,
+            disk_length,
+            file=file,
         )
 
     def read_extra_data(self):
@@ -325,8 +363,8 @@ def read_extra_data(self):
             `numpy.ndarray`
 
         """
-        c = self.file._c_interface
         file = self.file
+        c = file._c_interface
 
         (
             extra_data_offset,
@@ -389,17 +427,18 @@ def get_data(self):
             `numpy.ndarray`
 
         """
-        c = self.file._c_interface
         file = self.file
-        data_type, nwords = c.get_type_and_num_words(self.int_hdr)
+        c = file._c_interface
+        int_hdr = self.int_hdr
+        data_type, nwords = c.get_type_and_num_words(int_hdr)
 
         return c.read_record_data(
             file.fd,
             self.data_offset,
             self.disk_length,
             file.byte_ordering,
             file.word_size,
-            self.int_hdr,
+            int_hdr,
             self.real_hdr,
             data_type,
             nwords,