dev

davidhassell · davidhassell · commit b93e2d3f9013 · 2024-03-23T18:51:39.000Z
diff --git a/Changelog.rst b/Changelog.rst
@@ -9,6 +9,8 @@ version NEXT
   to regrid the vertical axis in logarithmic coordinates to
   `cf.Field.regrids` and `cf.Field.regridc`
   (https://github.com/NCAS-CMS/cf-python/issues/715)
+* Improve the performance of reading and accessing the data of PP and
+  UM fields files (https://github.com/NCAS-CMS/cf-python/issues/7??)
 * Improve `cf.Field.collapse` performance by lazily computing reduced
   axis coordinates (https://github.com/NCAS-CMS/cf-python/issues/741)
 * Fix misleading error message when it is not possible to create area
diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py
@@ -276,9 +276,12 @@ def _get_rec(self, f, header_offset):
 
         # ------------------------------------------------------------
         # Leave the following commented code here for debugging
-        # purposes. If you replace the above line with this commented
-        # code, then you must aslo set 'parse=True' in the `open`
-        # method.
+        # purposes. Replacing the above line with this code moves the
+        # calculation of the data offset and disk length from pure
+        # Python to the C library, at the expense of completely
+        # parsing the file. Note: If you do replace the above line
+        # with the commented code, then you *must* also set
+        # 'parse=True' in the `open` method.
         # ------------------------------------------------------------
 
         # for v in f.vars:
@@ -672,16 +675,18 @@ def get_word_size(self):
         return self._get_component("word_size", None)
 
     def open(self):
-        """Returns an open dataset containing the data array.
+        """Returns an open dataset and the address of the data.
 
         :Returns:
 
-            `umfile_lib.File`, `int`
+            `umread_lib.umfile.File`, `int`
+                The open file object, and the start address in bytes
+                of the lookup header.
 
         **Examples**
 
         >>> f.open()
-        (<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 44567)
+        (<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 4)
 
         """
         return super().open(
diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py
@@ -3439,7 +3439,7 @@ def read(
         else:
             byte_ordering = None
 
-        f = self.file_open(filename)
+        f = self.file_open(filename, parse=True)
 
         info = is_log_level_info(logger)
 
@@ -3472,7 +3472,7 @@ def _open_um_file(
         fmt=None,
         word_size=None,
         byte_ordering=None,
-            parse=True
+        parse=True,
     ):
         """Open a UM fields file or PP file.
 
@@ -3481,10 +3481,18 @@ def _open_um_file(
             filename: `str`
                 The file to be opened.
 
+            parse: `bool`, optional
+                If True, the default, then parse the contents. If
+                False then the contents are not parsed, which can be
+                considerably faster in cases when the contents are not
+                required.
+
+                .. versionadded:: NEXTVERSION
+
         :Returns:
 
-            `umread.umfile.File`
-                The opened file with an open file descriptor.
+            `umread_lib.umfile.File`
+                The open PP or FF file object.
 
         """
         self.file_close()
@@ -3494,7 +3502,7 @@ def _open_um_file(
                 byte_ordering=byte_ordering,
                 word_size=word_size,
                 fmt=fmt,
-                parse=parse
+                parse=parse,
             )
         except Exception as error:
             try:
@@ -3529,8 +3537,10 @@ def is_um_file(self, filename):
 
         """
         try:
+            # Note: No need to completely parse the file to ascertain
+            #       if it's PP or FF.
             self.file_open(filename, parse=False)
-        except Exception as error:
+        except Exception:
             self.file_close()
             return False
         else:
@@ -3559,8 +3569,19 @@ def file_open(self, filename, parse=True):
             filename: `str`
                 The file to be read.
 
+            parse: `bool`, optional
+                If True, the default, then parse the contents. If
+                False then the contents are not parsed, which can be
+                considerably faster in cases when the contents are not
+                required.
+
+                .. versionadded:: NEXTVERSION
+
         :Returns:
 
+            `umread_lib.umfile.File`
+                The open PP or FF file object.
+
         """
         g = getattr(self, "read_vars", {})
 
@@ -3569,7 +3590,7 @@ def file_open(self, filename, parse=True):
             byte_ordering=g.get("byte_ordering"),
             word_size=g.get("word_size"),
             fmt=g.get("fmt"),
-            parse=parse
+            parse=parse,
         )
 
 
diff --git a/cf/test/test_pp.py b/cf/test/test_pp.py
@@ -112,9 +112,8 @@ def test_PP_WGDOS_UNPACKING(self):
 
         f = cf.read(self.ppfile)[0]
 
-        # TODO: reinstate "CFA4" at version>3.14
-        for fmt in ("NETCDF4",):  # "CFA4"):
-            cf.write(f, tmpfile, fmt=fmt)
+        for cfa in (False, True):
+            cf.write(f, tmpfile, cfa=cfa)
             g = cf.read(tmpfile)[0]
 
             self.assertTrue((f.array == array).all())
diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py
@@ -551,7 +551,7 @@ def test_write_netcdf_mode(self):
 
     def test_read_write_netCDF4_compress_shuffle(self):
         f = cf.read(self.filename)[0]
-        # TODO: reinstate "CFA4" at version > 3.14
+        # TODODASK: reinstate "CFA4" at version > 3.14
         for fmt in ("NETCDF4", "NETCDF4_CLASSIC"):  # , "CFA4"):
             cf.write(f, tmpfile, fmt=fmt, compress=1, shuffle=True)
             g = cf.read(tmpfile)[0]
@@ -920,6 +920,7 @@ def test_write_omit_data(self):
         self.assertFalse(g.array.count())
         self.assertTrue(g.construct("grid_latitude").array.count())
 
+    @unittest.skipUnless(False, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION")
     def test_read_url(self):
         """Test reading urls."""
         for scheme in ("http", "https"):
diff --git a/cf/umread_lib/umfile.py b/cf/umread_lib/umfile.py
@@ -10,10 +10,12 @@
 class UMFileException(Exception):
     pass
 
-# Integer header pointers
-LBLREC = 14
-LBPACK = 20
-LBEGIN = 28
+
+# Lookup header pointers
+LBLREC = 14  # Length of data record (including any extra data)
+LBPACK = 20  # Packing method indicator
+LBEGIN = 28  # Disk address/Start Record
+
 
 class File:
     """A class for a UM file that gives a view of the file including
@@ -286,21 +288,21 @@ def __init__(
 
     @classmethod
     def from_file_and_offsets(
-        cls, file, header_offset, data_offset=None, disk_length=None
+        cls, file, hdr_offset, data_offset=None, disk_length=None
     ):
         """Instantiate a `Rec` object from the `File` object and the
         header and data offsets.
 
-        The headers are read in, and also the record object is ready for
-        calling `get_data`.
+        The lookup header is read in immediately, and the returned
+        record object is ready for calling `get_data`.
 
         :Parameters:
 
             file: `File`
                 A view of a file including sets of PP records combined
                 into variables.
 
-            header_offset: `int`
+            hdr_offset: `int`
                 The file start address of the header, in bytes.
 
             data_offset: `int`, optional
@@ -311,7 +313,7 @@ def from_file_and_offsets(
             disk_length: `int`
                 The length in bytes of the data in the file. If
                 `None`, the default, then the disk length will be
-                calculated from the integer header.
+                calculated from the integer header.
 
         :Returns:
 
@@ -321,32 +323,33 @@ def from_file_and_offsets(
         c = file._c_interface
         word_size = file.word_size
         int_hdr, real_hdr = c.read_header(
-            file.fd, header_offset, file.byte_ordering, word_size
+            file.fd, hdr_offset, file.byte_ordering, word_size
         )
 
         if data_offset is None:
             # Calculate the data offset from the integer header
             if file.fmt == "PP":
                 # We only support 64-word headers, so the data starts
-                # 66 words after the header_offset, i.e. 64 words of the
-                # header, plus 2 block control words.
-                data_offset = header_offset + 66 * word_size
+                # 66 words after the hdr_offset, i.e. after 64
+                # words of the header, plus 2 block control words.
+                data_offset = hdr_offset + 66 * word_size
             else:
                 # Fields file
                 data_offset = int_hdr[LBEGIN] * word_size
 
         if disk_length is None:
             # Calculate the disk length from the integer header
+            disk_length = int_hdr[LBLREC]
             if int_hdr[LBPACK] % 10 == 2:
                 # Cray 32-bit packing
-                disk_length = int_hdr[LBLREC] * 4
+                disk_length = disk_length * 4
             else:
-                disk_length = int_hdr[LBLREC] * word_size
+                disk_length = disk_length * word_size
 
         return cls(
             int_hdr,
             real_hdr,
-            header_offset,
+            hdr_offset,
             data_offset,
             disk_length,
             file=file,
@@ -360,8 +363,8 @@ def read_extra_data(self):
             `numpy.ndarray`
 
         """
-        c = self.file._c_interface
         file = self.file
+        c = file._c_interface
 
         (
             extra_data_offset,
@@ -424,17 +427,18 @@ def get_data(self):
             `numpy.ndarray`
 
         """
-        c = self.file._c_interface
         file = self.file
-        data_type, nwords = c.get_type_and_num_words(self.int_hdr)
+        c = file._c_interface
+        int_hdr = self.int_hdr
+        data_type, nwords = c.get_type_and_num_words(int_hdr)
 
         return c.read_record_data(
             file.fd,
             self.data_offset,
             self.disk_length,
             file.byte_ordering,
             file.word_size,
-            self.int_hdr,
+            int_hdr,
             self.real_hdr,
             data_type,
             nwords,