Skip to content

Commit e94f82a

Browse files
authored
Merge pull request #747 from davidhassell/um-speed
Improve the performance of reading and accessing the data of PP and UM fields files
2 parents 1ba918c + 45497ec commit e94f82a

File tree

7 files changed

+121
-39
lines changed

7 files changed

+121
-39
lines changed

Changelog.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ version NEXT
99
to regrid the vertical axis in logarithmic coordinates to
1010
`cf.Field.regrids` and `cf.Field.regridc`
1111
(https://github.com/NCAS-CMS/cf-python/issues/715)
12+
* Improve the performance of reading and accessing the data of PP and
13+
UM fields files (https://github.com/NCAS-CMS/cf-python/issues/746)
1214
* Improve `cf.Field.collapse` performance by lazily computing reduced
1315
axis coordinates (https://github.com/NCAS-CMS/cf-python/issues/741)
1416
* Improve `cf.Field.__getitem__` performance by not re-calculating

cf/data/array/umarray.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
load_stash2standard_name,
99
parse_indices,
1010
)
11-
from ...umread_lib.umfile import File
11+
from ...umread_lib.umfile import File, Rec
1212
from .abstract import Array
1313
from .mixin import FileArrayMixin
1414

@@ -272,13 +272,22 @@ def _get_rec(self, f, header_offset):
272272
The record container.
273273
274274
"""
275-
# TODOCFA: This method doesn't require data_offset and disk_length,
276-
# so plays nicely with CFA. Is it fast enough that we can
277-
# use this method always?
278-
for v in f.vars:
279-
for r in v.recs:
280-
if r.hdr_offset == header_offset:
281-
return r
275+
return Rec.from_file_and_offsets(f, header_offset)
276+
277+
# ------------------------------------------------------------
278+
# Leave the following commented code here for debugging
279+
# purposes. Replacing the above line with this code moves the
280+
# calculation of the data offset and disk length from pure
281+
# Python to the C library, at the expense of completely
282+
# parsing the file. Note: If you do replace the above line
283+
# with the commented code, then you *must* also set
284+
# 'parse=True' in the `open` method.
285+
# ------------------------------------------------------------
286+
287+
# for v in f.vars:
288+
# for r in v.recs:
289+
# if r.hdr_offset == header_offset:
290+
# return r
282291

283292
def _set_units(self, int_hdr):
284293
"""The units and calendar properties.
@@ -666,21 +675,24 @@ def get_word_size(self):
666675
return self._get_component("word_size", None)
667676

668677
def open(self):
669-
"""Returns an open dataset containing the data array.
678+
"""Returns an open dataset and the address of the lookup header.
670679
671680
:Returns:
672681
673-
`umfile_lib.File`, `int`
682+
`umread_lib.umfile.File`, `int`
683+
The open file object, and the start address in bytes
684+
of the lookup header.
674685
675686
**Examples**
676687
677688
>>> f.open()
678-
(<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 44567)
689+
(<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 4)
679690
680691
"""
681692
return super().open(
682693
File,
683694
byte_ordering=self.get_byte_ordering(),
684695
word_size=self.get_word_size(),
685696
fmt=self.get_fmt(),
697+
parse=False,
686698
)

cf/field.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -454,8 +454,11 @@ def __getitem__(self, indices):
454454
# below.
455455
if org_cyclic:
456456
new_cyclic = new_data.cyclic()
457-
[new.cyclic(i, iscyclic=False) for i in org_cyclic if i not in new_cyclic]
458-
457+
[
458+
new.cyclic(i, iscyclic=False)
459+
for i in org_cyclic
460+
if i not in new_cyclic
461+
]
459462

460463
# ------------------------------------------------------------
461464
# Subspace constructs with data

cf/read_write/um/umread.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2469,9 +2469,9 @@ def data_type_in_file(self, rec):
24692469
if rec.int_hdr.item(lbuser2) == 3:
24702470
# Boolean
24712471
return np.dtype(bool)
2472-
else:
2473-
# Int or float
2474-
return rec.get_type_and_num_words()[0]
2472+
2473+
# Int or float
2474+
return rec.get_type_and_num_words()[0]
24752475

24762476
def printfdr(self, display=False):
24772477
"""Print out the contents of PP field headers.
@@ -3439,7 +3439,7 @@ def read(
34393439
else:
34403440
byte_ordering = None
34413441

3442-
f = self.file_open(filename)
3442+
f = self.file_open(filename, parse=True)
34433443

34443444
info = is_log_level_info(logger)
34453445

@@ -3472,6 +3472,7 @@ def _open_um_file(
34723472
fmt=None,
34733473
word_size=None,
34743474
byte_ordering=None,
3475+
parse=True,
34753476
):
34763477
"""Open a UM fields file or PP file.
34773478
@@ -3480,10 +3481,18 @@ def _open_um_file(
34803481
filename: `str`
34813482
The file to be opened.
34823483
3484+
parse: `bool`, optional
3485+
If True, the default, then parse the contents. If
3486+
False then the contents are not parsed, which can be
3487+
considerably faster in cases when the contents are not
3488+
required.
3489+
3490+
.. versionadded:: NEXTVERSION
3491+
34833492
:Returns:
34843493
3485-
`umread.umfile.File`
3486-
The opened file with an open file descriptor.
3494+
`umread_lib.umfile.File`
3495+
The open PP or FF file object.
34873496
34883497
"""
34893498
self.file_close()
@@ -3493,6 +3502,7 @@ def _open_um_file(
34933502
byte_ordering=byte_ordering,
34943503
word_size=word_size,
34953504
fmt=fmt,
3505+
parse=parse,
34963506
)
34973507
except Exception as error:
34983508
try:
@@ -3527,7 +3537,9 @@ def is_um_file(self, filename):
35273537
35283538
"""
35293539
try:
3530-
self.file_open(filename)
3540+
# Note: No need to completely parse the file to ascertain
3541+
# if it's PP or FF.
3542+
self.file_open(filename, parse=False)
35313543
except Exception:
35323544
self.file_close()
35333545
return False
@@ -3549,16 +3561,27 @@ def file_close(self):
35493561

35503562
self._um_file = None
35513563

3552-
def file_open(self, filename):
3564+
def file_open(self, filename, parse=True):
35533565
"""Open the file for reading.
35543566
35553567
:Parameters:
35563568
35573569
filename: `str`
35583570
The file to be read.
35593571
3572+
parse: `bool`, optional
3573+
If True, the default, then parse the contents. If
3574+
False then the contents are not parsed, which can be
3575+
considerably faster in cases when the contents are not
3576+
required.
3577+
3578+
.. versionadded:: NEXTVERSION
3579+
35603580
:Returns:
35613581
3582+
`umread_lib.umfile.File`
3583+
The open PP or FF file object.
3584+
35623585
"""
35633586
g = getattr(self, "read_vars", {})
35643587

@@ -3567,6 +3590,7 @@ def file_open(self, filename):
35673590
byte_ordering=g.get("byte_ordering"),
35683591
word_size=g.get("word_size"),
35693592
fmt=g.get("fmt"),
3593+
parse=parse,
35703594
)
35713595

35723596

cf/test/test_pp.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,9 +112,8 @@ def test_PP_WGDOS_UNPACKING(self):
112112

113113
f = cf.read(self.ppfile)[0]
114114

115-
# TODO: reinstate "CFA4" at version>3.14
116-
for fmt in ("NETCDF4",): # "CFA4"):
117-
cf.write(f, tmpfile, fmt=fmt)
115+
for cfa in (False, True):
116+
cf.write(f, tmpfile, cfa=cfa)
118117
g = cf.read(tmpfile)[0]
119118

120119
self.assertTrue((f.array == array).all())

cf/test/test_read_write.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ def test_write_netcdf_mode(self):
551551

552552
def test_read_write_netCDF4_compress_shuffle(self):
553553
f = cf.read(self.filename)[0]
554-
# TODO: reinstate "CFA4" at version > 3.14
554+
# TODODASK: reinstate "CFA4" at version > 3.14
555555
for fmt in ("NETCDF4", "NETCDF4_CLASSIC"): # , "CFA4"):
556556
cf.write(f, tmpfile, fmt=fmt, compress=1, shuffle=True)
557557
g = cf.read(tmpfile)[0]
@@ -920,6 +920,9 @@ def test_write_omit_data(self):
920920
self.assertFalse(g.array.count())
921921
self.assertTrue(g.construct("grid_latitude").array.count())
922922

923+
@unittest.skipUnless(
924+
False, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION. TODO REPLACE URL"
925+
)
923926
def test_read_url(self):
924927
"""Test reading urls."""
925928
for scheme in ("http", "https"):

cf/umread_lib/umfile.py

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ class UMFileException(Exception):
1111
pass
1212

1313

14+
# Lookup header pointers
15+
LBLREC = 14 # Length of data record (including any extra data)
16+
LBPACK = 20 # Packing method indicator
17+
LBEGIN = 28 # Disk address/Start Record
18+
19+
1420
class File:
1521
"""A class for a UM file that gives a view of the file including
1622
sets of PP records combined into variables."""
@@ -34,7 +40,7 @@ def __init__(
3440
'little_endian' or 'big_endian'
3541
3642
word_size: `int`, optional
37-
4 or 8
43+
The size in bytes of one word. Either ``4`` or ``8``.
3844
3945
fmt: `str`, optional
4046
'FF' or 'PP'
@@ -281,12 +287,14 @@ def __init__(
281287
self.file = file
282288

283289
@classmethod
284-
def from_file_and_offsets(cls, file, hdr_offset, data_offset, disk_length):
290+
def from_file_and_offsets(
291+
cls, file, hdr_offset, data_offset=None, disk_length=None
292+
):
285293
"""Instantiate a `Rec` object from the `File` object and the
286294
header and data offsets.
287295
288-
The headers are read in, and also the record object is ready for
289-
calling `get_data`.
296+
The lookup header is read from disk immediately, and the
297+
returned record object is ready for calling `get_data`.
290298
291299
:Parameters:
292300
@@ -295,26 +303,56 @@ def from_file_and_offsets(cls, file, hdr_offset, data_offset, disk_length):
295303
into variables.
296304
297305
hdr_offset: `int`
298-
The start word in the file of the header.
306+
The file start address of the header, in bytes.
299307
300-
data_offset: `int`
301-
The start word in the file of the data.
308+
data_offset: `int`, optional
309+
The file start address of the data, in bytes. If
310+
`None`, the default, then the data offset will be
311+
calculated from the integer header.
302312
303313
disk_length: `int`, optional
304-
The length in words of the data in the file.
314+
The length in bytes of the data in the file. If
315+
`None`, the default, then the disk length will be
316+
calculated from the integer header.
305317
306318
:Returns:
307319
308320
`Rec`
309321
310322
"""
311323
c = file._c_interface
324+
word_size = file.word_size
312325
int_hdr, real_hdr = c.read_header(
313-
file.fd, hdr_offset, file.byte_ordering, file.word_size
326+
file.fd, hdr_offset, file.byte_ordering, word_size
314327
)
315328

329+
if data_offset is None:
330+
# Calculate the data offset from the integer header
331+
if file.fmt == "PP":
332+
# We only support 64-word headers, so the data starts
333+
# 66 words after the header_offset, i.e. after 64
334+
# words of the header, plus 2 block control words.
335+
data_offset = hdr_offset + 66 * word_size
336+
else:
337+
# Fields file
338+
data_offset = int_hdr[LBEGIN] * word_size
339+
340+
if disk_length is None:
341+
# Calculate the disk length from the integer header
342+
disk_length = int_hdr[LBLREC]
343+
if int_hdr[LBPACK] % 10 == 2:
344+
# Cray 32-bit packing
345+
disk_length = disk_length * 4
346+
else:
347+
disk_length = disk_length * word_size
348+
316349
return cls(
317-
int_hdr, real_hdr, hdr_offset, data_offset, disk_length, file=file
350+
int_hdr,
351+
real_hdr,
352+
hdr_offset,
353+
data_offset,
354+
disk_length,
355+
file=file,
318356
)
319357

320358
def read_extra_data(self):
@@ -325,8 +363,8 @@ def read_extra_data(self):
325363
`numpy.ndarray`
326364
327365
"""
328-
c = self.file._c_interface
329366
file = self.file
367+
c = file._c_interface
330368

331369
(
332370
extra_data_offset,
@@ -389,17 +427,18 @@ def get_data(self):
389427
`numpy.ndarray`
390428
391429
"""
392-
c = self.file._c_interface
393430
file = self.file
394-
data_type, nwords = c.get_type_and_num_words(self.int_hdr)
431+
c = file._c_interface
432+
int_hdr = self.int_hdr
433+
data_type, nwords = c.get_type_and_num_words(int_hdr)
395434

396435
return c.read_record_data(
397436
file.fd,
398437
self.data_offset,
399438
self.disk_length,
400439
file.byte_ordering,
401440
file.word_size,
402-
self.int_hdr,
441+
int_hdr,
403442
self.real_hdr,
404443
data_type,
405444
nwords,

0 commit comments

Comments
 (0)