Skip to content

Commit b93e2d3

Browse files
committed
dev
1 parent 0f1e820 commit b93e2d3

File tree

6 files changed

+69
-37
lines changed

6 files changed

+69
-37
lines changed

Changelog.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ version NEXT
99
to regrid the vertical axis in logarithmic coordinates to
1010
`cf.Field.regrids` and `cf.Field.regridc`
1111
(https://github.com/NCAS-CMS/cf-python/issues/715)
12+
* Improve the performance of reading and accessing the data of PP and
13+
UM fields files (https://github.com/NCAS-CMS/cf-python/issues/7??)
1214
* Improve `cf.Field.collapse` performance by lazily computing reduced
1315
axis coordinates (https://github.com/NCAS-CMS/cf-python/issues/741)
1416
* Fix misleading error message when it is not possible to create area

cf/data/array/umarray.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -276,9 +276,12 @@ def _get_rec(self, f, header_offset):
276276

277277
# ------------------------------------------------------------
278278
# Leave the following commented code here for debugging
279-
# purposes. If you replace the above line with this commented
280-
# code, then you must aslo set 'parse=True' in the `open`
281-
# method.
279+
# purposes. Replacing the above line with this code moves the
280+
# calculation of the data offset and disk length from pure
281+
# Python to the C library, at the expense of completely
282+
# parsing the file. Note: If you do replace the above line
283+
# with the commented code, then you *must* also set
284+
# 'parse=True' in the `open` method.
282285
# ------------------------------------------------------------
283286

284287
# for v in f.vars:
@@ -672,16 +675,18 @@ def get_word_size(self):
672675
return self._get_component("word_size", None)
673676

674677
def open(self):
675-
"""Returns an open dataset containing the data array.
678+
"""Returns an open dataset and the address of the data.
676679
677680
:Returns:
678681
679-
`umfile_lib.File`, `int`
682+
`umfile_lib.umfile.File`, `int`
683+
The open file object, and the start address in bytes
684+
of the lookup header.
680685
681686
**Examples**
682687
683688
>>> f.open()
684-
(<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 44567)
689+
(<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 4)
685690
686691
"""
687692
return super().open(

cf/read_write/um/umread.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3439,7 +3439,7 @@ def read(
34393439
else:
34403440
byte_ordering = None
34413441

3442-
f = self.file_open(filename)
3442+
f = self.file_open(filename, parse=True)
34433443

34443444
info = is_log_level_info(logger)
34453445

@@ -3472,7 +3472,7 @@ def _open_um_file(
34723472
fmt=None,
34733473
word_size=None,
34743474
byte_ordering=None,
3475-
parse=True
3475+
parse=True,
34763476
):
34773477
"""Open a UM fields file or PP file.
34783478
@@ -3481,10 +3481,18 @@ def _open_um_file(
34813481
filename: `str`
34823482
The file to be opened.
34833483
3484+
parse: `bool`, optional
3485+
If True, the default, then parse the contents. If
3486+
False then the contents are not parsed, which can be
3487+
considerably faster in cases when the contents are not
3488+
required.
3489+
3490+
.. versionadded:: NEXTVERSION
3491+
34843492
:Returns:
34853493
3486-
`umread.umfile.File`
3487-
The opened file with an open file descriptor.
3494+
`umread_lib.umfile.File`
3495+
The open PP or FF file object.
34883496
34893497
"""
34903498
self.file_close()
@@ -3494,7 +3502,7 @@ def _open_um_file(
34943502
byte_ordering=byte_ordering,
34953503
word_size=word_size,
34963504
fmt=fmt,
3497-
parse=parse
3505+
parse=parse,
34983506
)
34993507
except Exception as error:
35003508
try:
@@ -3529,8 +3537,10 @@ def is_um_file(self, filename):
35293537
35303538
"""
35313539
try:
3540+
# Note: No need to completely parse the file to ascertain
3541+
# if it's PP or FF.
35323542
self.file_open(filename, parse=False)
3533-
except Exception as error:
3543+
except Exception:
35343544
self.file_close()
35353545
return False
35363546
else:
@@ -3559,8 +3569,19 @@ def file_open(self, filename, parse=True):
35593569
filename: `str`
35603570
The file to be read.
35613571
3572+
parse: `bool`, optional
3573+
If True, the default, then parse the contents. If
3574+
False then the contents are not parsed, which can be
3575+
considerably faster in cases when the contents are not
3576+
required.
3577+
3578+
.. versionadded:: NEXTVERSION
3579+
35623580
:Returns:
35633581
3582+
`umread_lib.umfile.File`
3583+
The open PP or FF file object.
3584+
35643585
"""
35653586
g = getattr(self, "read_vars", {})
35663587

@@ -3569,7 +3590,7 @@ def file_open(self, filename, parse=True):
35693590
byte_ordering=g.get("byte_ordering"),
35703591
word_size=g.get("word_size"),
35713592
fmt=g.get("fmt"),
3572-
parse=parse
3593+
parse=parse,
35733594
)
35743595

35753596

cf/test/test_pp.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,9 +112,8 @@ def test_PP_WGDOS_UNPACKING(self):
112112

113113
f = cf.read(self.ppfile)[0]
114114

115-
# TODO: reinstate "CFA4" at version>3.14
116-
for fmt in ("NETCDF4",): # "CFA4"):
117-
cf.write(f, tmpfile, fmt=fmt)
115+
for cfa in (False, True):
116+
cf.write(f, tmpfile, cfa=cfa)
118117
g = cf.read(tmpfile)[0]
119118

120119
self.assertTrue((f.array == array).all())

cf/test/test_read_write.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ def test_write_netcdf_mode(self):
551551

552552
def test_read_write_netCDF4_compress_shuffle(self):
553553
f = cf.read(self.filename)[0]
554-
# TODO: reinstate "CFA4" at version > 3.14
554+
# TODODASK: reinstate "CFA4" at version > 3.14
555555
for fmt in ("NETCDF4", "NETCDF4_CLASSIC"): # , "CFA4"):
556556
cf.write(f, tmpfile, fmt=fmt, compress=1, shuffle=True)
557557
g = cf.read(tmpfile)[0]
@@ -920,6 +920,7 @@ def test_write_omit_data(self):
920920
self.assertFalse(g.array.count())
921921
self.assertTrue(g.construct("grid_latitude").array.count())
922922

923+
@unittest.skipUnless(False, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION")
923924
def test_read_url(self):
924925
"""Test reading urls."""
925926
for scheme in ("http", "https"):

cf/umread_lib/umfile.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@
1010
class UMFileException(Exception):
1111
pass
1212

13-
# Integer header pointers
14-
LBLREC = 14
15-
LBPACK = 20
16-
LBEGIN = 28
13+
14+
# Lookup header pointers
15+
LBLREC = 14 # Length of data record (including any extra data)
16+
LBPACK = 20 # Packing method indicator
17+
LBEGIN = 28 # Disk address/Start Record
18+
1719

1820
class File:
1921
"""A class for a UM file that gives a view of the file including
@@ -286,21 +288,21 @@ def __init__(
286288

287289
@classmethod
288290
def from_file_and_offsets(
289-
cls, file, header_offset, data_offset=None, disk_length=None
291+
cls, file, hdr_offset, data_offset=None, disk_length=None
290292
):
291293
"""Instantiate a `Rec` object from the `File` object and the
292294
header and data offsets.
293295
294-
The headers are read in, and also the record object is ready for
295-
calling `get_data`.
296+
The lookup header is read in immediately, and the returned
297+
record object is ready for calling `get_data`.
296298
297299
:Parameters:
298300
299301
file: `File`
300302
A view of a file including sets of PP records combined
301303
into variables.
302304
303-
header_offset: `int`
305+
hdr_offset: `int`
304306
The file start address of the header, in bytes.
305307
306308
data_offset: `int`, optional
@@ -311,7 +313,7 @@ def from_file_and_offsets(
311313
disk_length: `int`
312314
The length in bytes of the data in the file. If
313315
`None`, the default, then the disk length will be
314-
calculated from the integer header.
316+
calculated from the integer header.
315317
316318
:Returns:
317319
@@ -321,32 +323,33 @@ def from_file_and_offsets(
321323
c = file._c_interface
322324
word_size = file.word_size
323325
int_hdr, real_hdr = c.read_header(
324-
file.fd, header_offset, file.byte_ordering, word_size
326+
file.fd, hdr_offset, file.byte_ordering, word_size
325327
)
326328

327329
if data_offset is None:
328330
# Calculate the data offset from the integer header
329331
if file.fmt == "PP":
330332
# We only support 64-word headers, so the data starts
331-
# 66 words after the header_offset, i.e. 64 words of the
332-
# header, plus 2 block control words.
333-
data_offset = header_offset + 66 * word_size
333+
# 66 words after the hdr_offset, i.e. after 64
334+
# words of the header, plus 2 block control words.
335+
data_offset = hdr_offset + 66 * word_size
334336
else:
335337
# Fields file
336338
data_offset = int_hdr[LBEGIN] * word_size
337339

338340
if disk_length is None:
339341
# Calculate the disk length from the integer header
342+
disk_length = int_hdr[LBLREC]
340343
if int_hdr[LBPACK] % 10 == 2:
341344
# Cray 32-bit packing
342-
disk_length = int_hdr[LBLREC] * 4
345+
disk_length = disk_length * 4
343346
else:
344-
disk_length = int_hdr[LBLREC] * word_size
347+
disk_length = disk_length * word_size
345348

346349
return cls(
347350
int_hdr,
348351
real_hdr,
349-
header_offset,
352+
hdr_offset,
350353
data_offset,
351354
disk_length,
352355
file=file,
@@ -360,8 +363,8 @@ def read_extra_data(self):
360363
`numpy.ndarray`
361364
362365
"""
363-
c = self.file._c_interface
364366
file = self.file
367+
c = file._c_interface
365368

366369
(
367370
extra_data_offset,
@@ -424,17 +427,18 @@ def get_data(self):
424427
`numpy.ndarray`
425428
426429
"""
427-
c = self.file._c_interface
428430
file = self.file
429-
data_type, nwords = c.get_type_and_num_words(self.int_hdr)
431+
c = file._c_interface
432+
int_hdr = self.int_hdr
433+
data_type, nwords = c.get_type_and_num_words(int_hdr)
430434

431435
return c.read_record_data(
432436
file.fd,
433437
self.data_offset,
434438
self.disk_length,
435439
file.byte_ordering,
436440
file.word_size,
437-
self.int_hdr,
441+
int_hdr,
438442
self.real_hdr,
439443
data_type,
440444
nwords,

0 commit comments

Comments
 (0)