diff --git a/nibabel/externals/netcdf.py b/nibabel/externals/netcdf.py
index aca62aac80..30b30f5a7f 100644
--- a/nibabel/externals/netcdf.py
+++ b/nibabel/externals/netcdf.py
@@ -10,12 +10,16 @@
 NetCDF files. The same API is also used in the PyNIO and pynetcdf
 modules, allowing these modules to be used interchangeably when working
 with NetCDF files.
+
+Only NetCDF3 is supported here; for NetCDF4 see
+`netCDF4-python <http://unidata.github.io/netcdf4-python/>`__,
+which has a similar API.
+
 """
 
 # TODO:
 # * properly implement ``_FillValue``.
-# * implement Jeff Whitaker's patch for masked variables.
 # * fix character variables.
 # * implement PAGESIZE for Python 2.6?
 
 
@@ -28,18 +32,24 @@
 # otherwise the key would be inserted into userspace attributes.
 
 
-__all__ = ['netcdf_file']
+__all__ = ['netcdf_file', 'netcdf_variable']
 
+import sys
+import warnings
+import weakref
 from operator import mul
-from mmap import mmap, ACCESS_READ
+from collections import OrderedDict
 
-import numpy as np  # noqa
-from numpy.compat.py3k import asbytes, asstr
-from numpy import frombuffer, ndarray, dtype, empty, array, asarray
+import mmap as mm
+
+import numpy as np
+from numpy.compat import asbytes, asstr
+from numpy import frombuffer, dtype, empty, array, asarray
 from numpy import little_endian as LITTLE_ENDIAN
 from functools import reduce
 
+IS_PYPY = ('__pypy__' in sys.modules)
 
 ABSENT = b'\x00\x00\x00\x00\x00\x00\x00\x00'
 ZERO = b'\x00\x00\x00\x00'
 
@@ -52,27 +62,39 @@
 NC_DIMENSION = b'\x00\x00\x00\n'
 NC_VARIABLE = b'\x00\x00\x00\x0b'
 NC_ATTRIBUTE = b'\x00\x00\x00\x0c'
-
+FILL_BYTE = b'\x81'
+FILL_CHAR = b'\x00'
+FILL_SHORT = b'\x80\x01'
+FILL_INT = b'\x80\x00\x00\x01'
+FILL_FLOAT = b'\x7C\xF0\x00\x00'
+FILL_DOUBLE = b'\x47\x9E\x00\x00\x00\x00\x00\x00'
 
 TYPEMAP = {NC_BYTE: ('b', 1),
-        NC_CHAR: ('c', 1),
-        NC_SHORT: ('h', 2),
-        NC_INT: ('i', 4),
-        NC_FLOAT: ('f', 4),
-        NC_DOUBLE: ('d', 8)}
+           NC_CHAR: ('c', 1),
+           NC_SHORT: ('h', 2),
+           NC_INT: ('i', 4),
+           NC_FLOAT: ('f', 4),
+           NC_DOUBLE: ('d', 8)}
+
+FILLMAP = {NC_BYTE: FILL_BYTE,
+           NC_CHAR: FILL_CHAR,
+           NC_SHORT: FILL_SHORT,
+           NC_INT: FILL_INT,
+           NC_FLOAT: FILL_FLOAT,
+           NC_DOUBLE: FILL_DOUBLE}
 
 REVERSE = {('b', 1): NC_BYTE,
-        ('B', 1): NC_CHAR,
-        ('c', 1): NC_CHAR,
-        ('h', 2): NC_SHORT,
-        ('i', 4): NC_INT,
-        ('f', 4): NC_FLOAT,
-        ('d', 8): NC_DOUBLE,
+           ('B', 1): NC_CHAR,
+           ('c', 1): NC_CHAR,
+           ('h', 2): NC_SHORT,
+           ('i', 4): NC_INT,
+           ('f', 4): NC_FLOAT,
+           ('d', 8): NC_DOUBLE,
 
-        # these come from asarray(1).dtype.char and asarray('foo').dtype.char,
-        # used when getting the types from generic attributes.
-        ('l', 4): NC_INT,
-        ('S', 1): NC_CHAR}
+           # these come from asarray(1).dtype.char and asarray('foo').dtype.char,
+           # used when getting the types from generic attributes.
+           ('l', 4): NC_INT,
+           ('S', 1): NC_CHAR}
 
 
 class netcdf_file(object):
@@ -93,17 +115,22 @@ class netcdf_file(object):
     ----------
     filename : string or file-like
         string -> filename
-    mode : {'r', 'w'}, optional
-        read-write mode, default is 'r'
+    mode : {'r', 'w', 'a'}, optional
+        read-write-append mode, default is 'r'
     mmap : None or bool, optional
         Whether to mmap `filename` when reading.  Default is True
         when `filename` is a file name, False when `filename` is a
-        file-like object
+        file-like object. Note that when mmap is in use, data arrays
+        returned refer directly to the mmapped data on disk, and the
+        file cannot be closed as long as references to it exist.
     version : {1, 2}, optional
        version of netcdf to read / write, where 1 means *Classic
        format* and 2 means *64-bit offset format*.  Default is 1.  See
-        `here <https://www.unidata.ucar.edu/software/netcdf/docs/netcdf.html#select_format>`_
+        `here <https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_introduction.html#select_format>`__
        for more info.
+    maskandscale : bool, optional
+        Whether to automatically scale and/or mask data based on attributes.
+        Default is False.
 
     Notes
     -----
@@ -114,7 +141,7 @@ class netcdf_file(object):
     NetCDF files are a self-describing binary data format. The file contains
     metadata that describes the dimensions and variables in the file. More
     details about NetCDF files can be found `here
-    <https://www.unidata.ucar.edu/software/netcdf/docs/netcdf.html>`_. There
+    <https://www.unidata.ucar.edu/software/netcdf/docs/user_guide.html>`__. There
     are three main sections to a NetCDF data structure:
 
     1. Dimensions
@@ -142,6 +169,13 @@ class netcdf_file(object):
     unnecessary data into memory. It uses the ``mmap`` module to create
     Numpy arrays mapped to the data on disk, for the same purpose.
 
+    Note that when `netcdf_file` is used to open a file with mmap=True
+    (default for read-only), arrays returned by it refer to data
+    directly on the disk. The file should not be closed, and cannot be cleanly
+    closed when asked, if such arrays are alive. You may want to copy data arrays
+    obtained from mmapped Netcdf file if they are to be processed after the file
+    is closed, see the example below.
+
     Examples
     --------
     To create a NetCDF file:
@@ -163,9 +197,9 @@ class netcdf_file(object):
     >>> time.units = 'days since 2008-01-01'
     >>> f.close()
 
-    Note the assignment of ``range(10)`` to ``time[:]``.  Exposing the slice
+    Note the assignment of ``arange(10)`` to ``time[:]``.  Exposing the slice
     of the time variable allows for the data to be set in the object, rather
-    than letting ``range(10)`` overwrite the ``time`` variable.
+    than letting ``arange(10)`` overwrite the ``time`` variable.
 
     To read the NetCDF file we just created:
 
@@ -179,7 +213,22 @@ class netcdf_file(object):
     True
     >>> time[-1]
     9
+
+    NetCDF files, when opened read-only, return arrays that refer
+    directly to memory-mapped data on disk:
+
+    >>> data = time[:]
+    >>> data.base.base  # doctest: +ELLIPSIS
+    <mmap.mmap object at 0x...>
+
+    If the data is to be processed after the file is closed, it needs
+    to be copied to main memory:
+
+    >>> data = time[:].copy()
+    >>> del time  # References to mmap'd objects can delay full closure
     >>> f.close()
+    >>> data.mean()
+    4.5
 
     A NetCDF file can also be used as context manager:
 
@@ -189,12 +238,16 @@ class netcdf_file(object):
 
     Delete our temporary directory and file:
 
-    >>> del f, time  # needed for windows unlink
+    >>> del f  # needed for windows unlink
     >>> os.unlink(fname)
     >>> os.rmdir(tmp_pth)
     """
-    def __init__(self, filename, mode='r', mmap=None, version=1):
+    def __init__(self, filename, mode='r', mmap=None, version=1,
+                 maskandscale=False):
         """Initialize netcdf_file from fileobj (str or file-like)."""
+        if mode not in 'rwa':
+            raise ValueError("Mode must be either 'r', 'w' or 'a'.")
+
         if hasattr(filename, 'seek'):  # file-like
             self.fp = filename
             self.filename = 'None'
@@ -204,34 +257,39 @@ def __init__(self, filename, mode='r', mmap=None, version=1):
                 raise ValueError('Cannot use file object for mmap')
         else:  # maybe it's a string
             self.filename = filename
-            self.fp = open(self.filename, '%sb' % mode)
+            omode = 'r+' if mode == 'a' else mode
+            self.fp = open(self.filename, '%sb' % omode)
             if mmap is None:
-                mmap = True
-
-        try:
-            self.fp.seek(0, 2)
-        except ValueError:
-            self.file_bytes = -1  # Unknown file length (gzip).
-        else:
-            self.file_bytes = self.fp.tell()
-            self.fp.seek(0)
+                # Mmapped files on PyPy cannot be usually closed
+                # before the GC runs, so it's better to use mmap=False
+                # as the default.
+                mmap = (not IS_PYPY)
 
-        self.use_mmap = mmap
-        self.version_byte = version
+        if mode != 'r':
+            # Cannot read write-only files
+            mmap = False
 
-        if not mode in 'rw':
-            raise ValueError("Mode must be either 'r' or 'w'.")
+        self.use_mmap = mmap
         self.mode = mode
+        self.version_byte = version
+        self.maskandscale = maskandscale
 
-        self.dimensions = {}
-        self.variables = {}
+        self.dimensions = OrderedDict()
+        self.variables = OrderedDict()
 
         self._dims = []
         self._recs = 0
         self._recsize = 0
 
-        self._attributes = {}
+        self._mm = None
+        self._mm_buf = None
+        if self.use_mmap:
+            self._mm = mm.mmap(self.fp.fileno(), 0, access=mm.ACCESS_READ)
+            self._mm_buf = np.frombuffer(self._mm, dtype=np.int8)
 
-        if mode == 'r':
+        self._attributes = OrderedDict()
+
+        if mode in 'ra':
             self._read()
 
     def __setattr__(self, attr, value):
@@ -245,10 +303,28 @@ def __setattr__(self, attr, value):
 
     def close(self):
         """Closes the NetCDF file."""
-        if not self.fp.closed:
+        if hasattr(self, 'fp') and not self.fp.closed:
             try:
                 self.flush()
             finally:
+                self.variables = OrderedDict()
+                if self._mm_buf is not None:
+                    ref = weakref.ref(self._mm_buf)
+                    self._mm_buf = None
+                    if ref() is None:
+                        # self._mm_buf is gc'd, and we can close the mmap
+                        self._mm.close()
+                    else:
+                        # we cannot close self._mm, since self._mm_buf is
+                        # alive and there may still be arrays referring to it
+                        warnings.warn((
+                            "Cannot close a netcdf_file opened with mmap=True, when "
+                            "netcdf_variables or arrays referring to its data still exist. "
+                            "All data arrays obtained from such files refer directly to "
+                            "data on disk, and must be copied before the file can be cleanly "
+                            "closed. (See netcdf_file docstring for more information on mmap.)"
+                        ), category=RuntimeWarning)
+                self._mm = None
                 self.fp.close()
     __del__ = close
 
@@ -278,6 +354,9 @@ def createDimension(self, name, length):
         createVariable
 
         """
+        if length is None and self._dims:
+            raise ValueError("Only first dimension may be unlimited!")
+
         self.dimensions[name] = length
         self._dims.append(name)
 
@@ -321,7 +400,9 @@ def createVariable(self, name, type, dimensions):
             raise ValueError("NetCDF 3 does not support type %s" % type)
 
         data = empty(shape_, dtype=type.newbyteorder("B"))  # convert to big endian always for NetCDF 3
-        self.variables[name] = netcdf_variable(data, typecode, size, shape, dimensions)
+        self.variables[name] = netcdf_variable(
+                data, typecode, size, shape, dimensions,
+                maskandscale=self.maskandscale)
         return self.variables[name]
 
     def flush(self):
@@ -333,7 +414,7 @@ def flush(self):
         sync : Identical function
 
         """
-        if hasattr(self, 'mode') and self.mode is 'w':
+        if hasattr(self, 'mode') and self.mode in 'wa':
             self._write()
     sync = flush
 
@@ -375,7 +456,7 @@ def _write_att_array(self, attributes):
             self._pack_int(len(attributes))
             for name, values in attributes.items():
                 self._pack_string(name)
-                self._write_values(values)
+                self._write_att_values(values)
         else:
             self.fp.write(ABSENT)
 
@@ -384,11 +465,13 @@ def _write_var_array(self):
             self.fp.write(NC_VARIABLE)
             self._pack_int(len(self.variables))
 
-            # Sort variables non-recs first, then recs. We use a DSU
-            # since some people use pupynere with Python 2.3.x.
-            deco = [(v._shape and not v.isrec, k) for (k, v) in self.variables.items()]
-            deco.sort()
-            variables = [k for (unused, k) in deco][::-1]
+            # Sort variable names non-recs first, then recs.
+            def sortkey(n):
+                v = self.variables[n]
+                if v.isrec:
+                    return (-1,)
+                return v._shape
+            variables = sorted(self.variables, key=sortkey, reverse=True)
 
             # Set the metadata for all variables.
             for name in variables:
@@ -426,8 +509,8 @@ def _write_var_metadata(self, name):
                 vsize = var.data[0].size * var.data.itemsize
             except IndexError:
                 vsize = 0
-        rec_vars = len([var for var in self.variables.values()
-                        if var.isrec])
+        rec_vars = len([v for v in self.variables.values()
+                        if v.isrec])
         if rec_vars > 1:
             vsize += -vsize % 4
         self.variables[name].__dict__['_vsize'] = vsize
 
@@ -450,12 +533,17 @@ def _write_var_data(self, name):
         if not var.isrec:
             self.fp.write(var.data.tostring())
             count = var.data.size * var.data.itemsize
-            self.fp.write(b'0' * (var._vsize - count))
+            self._write_var_padding(var, var._vsize - count)
         else:  # record variable
             # Handle rec vars with shape[0] < nrecs.
             if self._recs > len(var.data):
                 shape = (self._recs,) + var.data.shape[1:]
-                var.data.resize(shape)
+                # Resize in-place does not always work since
+                # the array might not be single-segment
+                try:
+                    var.data.resize(shape)
+                except ValueError:
+                    var.__dict__['data'] = np.resize(var.data, shape).astype(var.data.dtype)
 
             pos0 = pos = self.fp.tell()
             for rec in var.data:
@@ -468,30 +556,42 @@ def _write_var_data(self, name):
                 self.fp.write(rec.tostring())
                 # Padding
                 count = rec.size * rec.itemsize
-                self.fp.write(b'0' * (var._vsize - count))
+                self._write_var_padding(var, var._vsize - count)
                 pos += self._recsize
                 self.fp.seek(pos)
             self.fp.seek(pos0 + var._vsize)
 
-    def _write_values(self, values):
+    def _write_var_padding(self, var, size):
+        encoded_fill_value = var._get_encoded_fill_value()
+        num_fills = size // len(encoded_fill_value)
+        self.fp.write(encoded_fill_value * num_fills)
+
+    def _write_att_values(self, values):
         if hasattr(values, 'dtype'):
             nc_type = REVERSE[values.dtype.char, values.dtype.itemsize]
         else:
             types = [(int, NC_INT),
                      (float, NC_FLOAT),
-                     (str, NC_CHAR),
+                     (str, NC_CHAR)
                      ]
-            try:
-                sample = values[0]
-            except TypeError:
+
+            # bytes index into scalars in py3k.  Check for "string" types
+            if isinstance(values, (str, bytes)):
                 sample = values
+            else:
+                try:
+                    sample = values[0]  # subscriptable?
+                except TypeError:
+                    sample = values  # scalar
 
             for class_, nc_type in types:
                 if isinstance(sample, class_):
                     break
 
         typecode, size = TYPEMAP[nc_type]
         dtype_ = '>%s' % typecode
+        # asarray() dies with bytes and '>c' in py3k.  Change to 'S'
+        dtype_ = 'S' if dtype_ == '>c' else dtype_
 
         values = asarray(values, dtype=dtype_)
 
@@ -508,7 +608,7 @@ def _write_values(self, values):
             values = values.byteswap()
         self.fp.write(values.tostring())
         count = values.size * values.itemsize
-        self.fp.write(b'0' * (-count % 4))  # pad
+        self.fp.write(b'\x00' * (-count % 4))  # pad
 
     def _read(self):
         # Check magic bytes and version
@@ -529,7 +629,7 @@ def _read_numrecs(self):
 
     def _read_dim_array(self):
         header = self.fp.read(4)
-        if not header in [ZERO, NC_DIMENSION]:
+        if header not in [ZERO, NC_DIMENSION]:
             raise ValueError("Unexpected header.")
 
         count = self._unpack_int()
 
@@ -545,19 +645,19 @@ def _read_gatt_array(self):
 
     def _read_att_array(self):
         header = self.fp.read(4)
-        if not header in [ZERO, NC_ATTRIBUTE]:
+        if header not in [ZERO, NC_ATTRIBUTE]:
             raise ValueError("Unexpected header.")
 
         count = self._unpack_int()
-        attributes = {}
+        attributes = OrderedDict()
         for attr in range(count):
             name = asstr(self._unpack_string())
-            attributes[name] = self._read_values()
+            attributes[name] = self._read_att_values()
         return attributes
 
     def _read_var_array(self):
         header = self.fp.read(4)
-        if not header in [ZERO, NC_VARIABLE]:
+        if header not in [ZERO, NC_VARIABLE]:
             raise ValueError("Unexpected header.")
 
         begin = 0
@@ -567,7 +667,7 @@ def _read_var_array(self):
         for var in range(count):
             (name, dimensions, shape, attributes,
              typecode, size, dtype_, begin_, vsize) = self._read_var()
-            # https://www.unidata.ucar.edu/software/netcdf/docs/netcdf.html
+            # https://www.unidata.ucar.edu/software/netcdf/docs/user_guide.html
             # Note that vsize is the product of the dimension lengths
             # (omitting the record dimension) and the number of bytes
             # per value (determined from the type), increased to the
@@ -604,28 +704,21 @@ def _read_var_array(self):
             else:  # not a record variable
                 # Calculate size to avoid problems with vsize (above)
                 a_size = reduce(mul, shape, 1) * size
-                if self.file_bytes >= 0 and begin_ + a_size > self.file_bytes:
-                    data = frombuffer(b'\x00'*a_size, dtype=dtype_)
-                elif self.use_mmap:
-                    mm = mmap(self.fp.fileno(), begin_+a_size, access=ACCESS_READ)
-                    data = ndarray.__new__(ndarray, shape, dtype=dtype_,
-                                           buffer=mm, offset=begin_, order=0)
+                if self.use_mmap:
+                    data = self._mm_buf[begin_:begin_+a_size].view(dtype=dtype_)
+                    data.shape = shape
                 else:
                     pos = self.fp.tell()
                     self.fp.seek(begin_)
-                    # Try to read file, which may fail because the data is
-                    # at or past the end of file. In that case, we treat
-                    # this data as zeros.
-                    buf = self.fp.read(a_size)
-                    if len(buf) < a_size:
-                        buf = b'\x00'*a_size
-                    data = frombuffer(buf, dtype=dtype_)
+                    data = frombuffer(self.fp.read(a_size), dtype=dtype_
+                                      ).copy()
                     data.shape = shape
                     self.fp.seek(pos)
 
             # Add variable.
             self.variables[name] = netcdf_variable(
-                data, typecode, size, shape, dimensions, attributes)
+                data, typecode, size, shape, dimensions, attributes,
+                maskandscale=self.maskandscale)
 
         if rec_vars:
             # Remove padding when only one record variable.
@@ -635,13 +728,13 @@ def _read_var_array(self):
             # Build rec array.
             if self.use_mmap:
-                mm = mmap(self.fp.fileno(), begin+self._recs*self._recsize, access=ACCESS_READ)
-                rec_array = ndarray.__new__(ndarray, (self._recs,), dtype=dtypes,
-                                            buffer=mm, offset=begin, order=0)
+                rec_array = self._mm_buf[begin:begin+self._recs*self._recsize].view(dtype=dtypes)
+                rec_array.shape = (self._recs,)
             else:
                 pos = self.fp.tell()
                 self.fp.seek(begin)
-                rec_array = frombuffer(self.fp.read(self._recs*self._recsize), dtype=dtypes)
+                rec_array = frombuffer(self.fp.read(self._recs*self._recsize),
+                                       dtype=dtypes).copy()
                 rec_array.shape = (self._recs,)
                 self.fp.seek(pos)
 
@@ -673,7 +766,7 @@ def _read_var(self):
 
         return name, dimensions, shape, attributes, typecode, size, dtype_, begin, vsize
 
-    def _read_values(self):
+    def _read_att_values(self):
         nc_type = self.fp.read(4)
         n = self._unpack_int()
 
@@ -684,7 +777,7 @@ def _read_values(self):
         self.fp.read(-count % 4)  # read padding
 
         if typecode is not 'c':
-            values = frombuffer(values, dtype='>%s' % typecode)
+            values = frombuffer(values, dtype='>%s' % typecode).copy()
             if values.shape == (1,):
                 values = values[0]
         else:
 
@@ -715,7 +808,7 @@ def _pack_string(self, s):
         count = len(s)
         self._pack_int(count)
         self.fp.write(asbytes(s))
-        self.fp.write(b'0' * (-count % 4))  # pad
+        self.fp.write(b'\x00' * (-count % 4))  # pad
 
     def _unpack_string(self):
         count = self._unpack_int()
 
@@ -726,7 +819,7 @@ def _unpack_string(self):
 
 class netcdf_variable(object):
     """
-    A data object for the `netcdf` module.
+    A data object for netcdf files.
 
     `netcdf_variable` objects are constructed by calling the method
     `netcdf_file.createVariable` on the `netcdf_file` object. `netcdf_variable`
 
@@ -760,6 +853,9 @@ class netcdf_variable(object):
     attributes : dict, optional
         Attribute values (any type) keyed by string names.  These attributes
         become attributes for the netcdf_variable object.
+    maskandscale : bool, optional
+        Whether to automatically scale and/or mask data based on attributes.
+        Default is False.
 
     Attributes
     ----------
 
@@ -774,14 +870,17 @@ class netcdf_variable(object):
     isrec, shape
 
     """
-    def __init__(self, data, typecode, size, shape, dimensions, attributes=None):
+    def __init__(self, data, typecode, size, shape, dimensions,
+                 attributes=None,
+                 maskandscale=False):
        self.data = data
        self._typecode = typecode
        self._size = size
        self._shape = shape
        self.dimensions = dimensions
+        self.maskandscale = maskandscale
 
-        self._attributes = attributes or {}
+        self._attributes = attributes or OrderedDict()
        for k, v in self._attributes.items():
            self.__dict__[k] = v
 
@@ -803,7 +902,7 @@ def isrec(self):
         `netcdf_variable`.
""" - return self.data.shape and not self._shape[0] + return bool(self.data.shape) and not self._shape[0] isrec = property(isrec) def shape(self): @@ -880,9 +979,36 @@ def itemsize(self): return self._size def __getitem__(self, index): - return self.data[index] + if not self.maskandscale: + return self.data[index] + + data = self.data[index].copy() + missing_value = self._get_missing_value() + data = self._apply_missing_value(data, missing_value) + scale_factor = self._attributes.get('scale_factor') + add_offset = self._attributes.get('add_offset') + if add_offset is not None or scale_factor is not None: + data = data.astype(np.float64) + if scale_factor is not None: + data = data * scale_factor + if add_offset is not None: + data += add_offset + + return data def __setitem__(self, index, data): + if self.maskandscale: + missing_value = ( + self._get_missing_value() or + getattr(data, 'fill_value', 999999)) + self._attributes.setdefault('missing_value', missing_value) + self._attributes.setdefault('_FillValue', missing_value) + data = ((data - self._attributes.get('add_offset', 0.0)) / + self._attributes.get('scale_factor', 1.0)) + data = np.ma.asarray(data).filled(missing_value) + if self._typecode not in 'fd' and data.dtype.kind == 'f': + data = np.round(data) + # Expand data for record vars? if self.isrec: if isinstance(index, tuple): @@ -895,9 +1021,86 @@ def __setitem__(self, index, data): recs = rec_index + 1 if recs > len(self.data): shape = (recs,) + self._shape[1:] - self.data.resize(shape) + # Resize in-place does not always work since + # the array might not be single-segment + try: + self.data.resize(shape) + except ValueError: + self.__dict__['data'] = np.resize(self.data, shape).astype(self.data.dtype) self.data[index] = data + def _default_encoded_fill_value(self): + """ + The default encoded fill-value for this Variable's data type. + """ + nc_type = REVERSE[self.typecode(), self.itemsize()] + return FILLMAP[nc_type] + + def _get_encoded_fill_value(self): + """ + Returns the encoded fill value for this variable as bytes. + + This is taken from either the _FillValue attribute, or the default fill + value for this variable's data type. + """ + if '_FillValue' in self._attributes: + fill_value = np.array(self._attributes['_FillValue'], + dtype=self.data.dtype).tostring() + if len(fill_value) == self.itemsize(): + return fill_value + else: + return self._default_encoded_fill_value() + else: + return self._default_encoded_fill_value() + + def _get_missing_value(self): + """ + Returns the value denoting "no data" for this variable. + + If this variable does not have a missing/fill value, returns None. + + If both _FillValue and missing_value are given, give precedence to + _FillValue. The netCDF standard gives special meaning to _FillValue; + missing_value is just used for compatibility with old datasets. + """ + + if '_FillValue' in self._attributes: + missing_value = self._attributes['_FillValue'] + elif 'missing_value' in self._attributes: + missing_value = self._attributes['missing_value'] + else: + missing_value = None + + return missing_value + + @staticmethod + def _apply_missing_value(data, missing_value): + """ + Applies the given missing value to the data array. + + Returns a numpy.ma array, with any value equal to missing_value masked + out (unless missing_value is None, in which case the original array is + returned). 
+ """ + + if missing_value is None: + newdata = data + else: + try: + missing_value_isnan = np.isnan(missing_value) + except (TypeError, NotImplementedError): + # some data types (e.g., characters) cannot be tested for NaN + missing_value_isnan = False + + if missing_value_isnan: + mymask = np.isnan(data) + else: + mymask = (data == missing_value) + + newdata = np.ma.masked_where(mymask, data) + + return newdata + NetCDFFile = netcdf_file NetCDFVariable = netcdf_variable