1 change: 1 addition & 0 deletions .gitignore
@@ -68,3 +68,4 @@ xarray/version.py
Icon*

.ipynb_checkpoints
.nfs*
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -68,6 +68,9 @@ Enhancements
- :py:meth:`pandas.Series.dropna` is now supported for a
:py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex`
(:issue:`2688`). By `Spencer Clark <https://github.com/spencerkclark>`_.
- Variables are now unpacked to a dtype determined by the ``scale_factor`` and ``add_offset`` dtypes
when these attributes are present, following the `CF conventions <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch08.html>`_.
By `Daoud Jahdou <https://github.com/daoudjahdou>`_.
- :py:meth:`~xarray.open_dataset` now accepts a ``use_cftime`` argument, which
can be used to require that ``cftime.datetime`` objects are always used, or
never used when decoding dates encoded with a standard calendar. This can be
52 changes: 43 additions & 9 deletions xarray/coding/variables.py
@@ -189,8 +189,37 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
return data


def _choose_float_dtype(dtype, has_offset):
def _choose_float_dtype(dtype, scale_factor, add_offset, mode=None):
"""Return a float dtype that can losslessly represent `dtype` values."""
# Implementing the cf-convention:
# http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch08.html
# Detail:
# If the scale_factor and add_offset attributes are of the same data
# type as the associated variable, the unpacked data is assumed to be
# of the same data type as the packed data. However, if the
# scale_factor and add_offset attributes are of a different data type
# from the variable (containing the packed data) then the unpacked
# data should match the type of these attributes, which must both be
# of type float or both be of type double. An additional restriction
# in this case is that
shoyer (Member) commented:
What should we do if add_offset/scale_factor are not floats?

Right now in your implementation, if scale_factor is mistakenly np.int64(10) then the data would get decoded as int64, which seems like a bad idea (e.g., if the data is some float type). I think we should probably ignore the type of scale_factor/add_offset in these cases instead.

magau commented (Feb 21, 2019):
Hi @shoyer! Sorry for my delayed intervention... First of all I'd like to thank and congratulate you for your efficiency, and say that I'm very glad to see that you've taken my considerations about this topic into account.

About the comment above I'd say that, since the CFScaleOffsetCoder will always multiply the raw data by the scale_factor, if the scale_factor is an integer the decoded values will always be integers too (unless the raw data dtype is float, as you've mentioned).
Even in these cases I'd return the decoded values with the same dtype as the scale_factor / add_offset, which is the expected behavior (data providers must be aware...). Users can always use open_dataset with decode_cf=False and call the decode_cf_variable function later, after fixing the variable.attrs['scale_factor / add_offset'] dtypes.
A possible solution is to add scale_factor and add_offset as arguments of the open_dataset function, to overwrite the original ones; these would receive labeled values like: scale_factor={'var1': whatever, 'var2': whatever}, add_offset={'var1': whatever, 'var2': whatever}. This would also resolve the #2304 issue (with a better approach, in my opinion).

By the way, I've also noted that CFScaleOffsetCoder.encode changes the values.dtype before applying the inverse operation (i.e. dividing by the scale_factor), which will result in truncated values. I've already faced this situation with my own netcdf encoding processes, and would advise you to only change the dtype after applying the transformation, and to use the numpy.round function (for integer encoding) to avoid truncation artifacts.

Once again, sorry for my extensive comments and for not opening an issue, as I should, but my intention is that you evaluate my considerations, make your own decisions with your colleagues, and keep up the nice work you all have been doing. 👍

Cheers!
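A minimal sketch of the ordering magau describes, illustrative only (the pack_values helper and its use of np.round are assumptions, not part of this PR): do the scale/offset arithmetic while the data is still floating point, and only round and cast to the packed integer dtype at the end, so values such as 122.9999 are not truncated to 122.

import numpy as np

def pack_values(data, scale_factor, add_offset, packed_dtype):
    # Hypothetical helper: do the arithmetic in float space first...
    scaled = (np.asarray(data, dtype=np.float64) - add_offset) / scale_factor
    # ...then round before casting, so 0.9999... becomes 1 instead of 0.
    return np.round(scaled).astype(packed_dtype)

# Packing floats into int16 with scale_factor=0.01 and add_offset=10:
print(pack_values([10.0, 10.01, 11.23], 0.01, 10, np.int16))  # [  0   1 123]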

# the variable containing the packed data must
# be of type byte, short or int.
# It is not advised to unpack an int into a float as there
# is a potential precision loss.

if mode == 'decoding':
shoyer (Member) commented:
Rather than a mode argument, could you write two separate functions, _choose_encoding_float_dtype and _choose_decoding_float_dtype? It would be OK for one to call the other, but mode arguments are a mild code smell.
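A rough sketch of the split suggested here, with one function calling the other; everything below is an assumption about how it could look (including ignoring non-float scale_factor/add_offset dtypes, as raised in the earlier comment), not the PR's actual code:

import numpy as np


def _choose_encoding_float_dtype(dtype, has_offset):
    """Sketch: float dtype to use when packing (encoding) data."""
    # Keep single/half precision floats as float32.
    if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
        return np.float32
    # Small integers fit losslessly in float32 unless an offset is involved.
    if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer) and not has_offset:
        return np.float32
    # Everything else (larger ints, float64, or any offset) gets float64.
    return np.float64


def _choose_decoding_float_dtype(dtype, scale_factor, add_offset):
    """Sketch: float dtype to use when unpacking (decoding) data."""
    attr_dtypes = [np.dtype(type(attr))
                   for attr in (scale_factor, add_offset) if attr is not None]
    float_attr_dtypes = [d for d in attr_dtypes if np.issubdtype(d, np.floating)]
    if float_attr_dtypes:
        # Per CF, the unpacked data should match the (float) attribute type;
        # promote with the packed dtype and force at least single precision.
        return np.result_type(np.float32, dtype, *float_attr_dtypes)
    # Non-float attributes (e.g. a stray int64 scale_factor) are ignored,
    # falling back to the encoding rule as suggested in the review.
    return _choose_encoding_float_dtype(dtype, add_offset is not None)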

types = [np.dtype(type(scale_factor)),
np.dtype(type(add_offset)),
np.dtype(dtype)]
# scaled_type should be the largest type we find
scaled_dtype = max(types)
shoyer (Member) commented:
Use xarray.core.dtypes.result_type() here instead of max().

I don't know how NumPy defines comparison between dtypes and I can't find it documented anywhere, so I'm a little nervous about relying on it.

DevDaoud (Author) replied:
@shoyer I've tested it with all numpy dtype pair combinations and it turns out that it always gives the largest; I can for sure modify it.
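For reference, a quick illustration of the promotion being asked for; np.result_type (which xarray.core.dtypes.result_type wraps, adding a few object-dtype rules) uses NumPy's promotion table rather than dtype comparison:

import numpy as np

# The promotion table picks a dtype that can represent both inputs:
print(np.result_type(np.int16, np.float32))    # float32
print(np.result_type(np.float32, np.float64))  # float64
print(np.result_type(np.uint8, np.float16))    # float16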


# We return it only if it's a float32 or a float64
if (scaled_dtype.itemsize >= 4
and np.issubdtype(scaled_dtype, np.floating)):
return scaled_dtype

# Keep float32 as-is. Upcast half-precision to single-precision,
# because float16 is "intended for storage but not computation"
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
@@ -201,7 +230,7 @@ def _choose_float_dtype(dtype, has_offset):
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if not has_offset:
if not add_offset:
return np.float32
# For all other types and circumstances, we just use float64.
# (safe because eg. complex numbers are not supported in NetCDF)
@@ -217,14 +246,18 @@ class CFScaleOffsetCoder(VariableCoder):

def encode(self, variable, name=None):
dims, data, attrs, encoding = unpack_for_encoding(variable)

if 'scale_factor' in encoding or 'add_offset' in encoding:
shoyer (Member) commented:
Alternative logic for this clause that I think might work:

if 'scale_factor' in encoding or 'add_offset' in encoding:
    scale_factor = encoding.pop('scale_factor', 1)
    add_offset = encoding.pop('add_offset', 0)
    if 'dtype' in encoding:
        dtype = pop_to(encoding, attrs, 'dtype', name=name)
    else:
        dtype = _choose_encoding_float_dtype(data.dtype, add_offset != 0)

    # save scale_factor and add_offset with dtype matching the decoded
    # data, per CF conventions
    safe_setitem(attrs, 'scale_factor', data.dtype.type(scale_factor), name)
    safe_setitem(attrs, 'add_offset', data.dtype.type(add_offset), name)

    # apply scale/offset encoding
    data = data.astype(dtype=dtype, copy=True)
    if add_offset:
        data -= add_offset
    if scale_factor != 1:
        # multiplication is faster than division
        data *= 1/scale_factor

The idea here is to always write the scale_factor and add_offset attributes according to CF conventions, which ensures that data always gets read out the right way. This does remove some user flexibility, but xarray is OK with being opinionated about how it writes metadata -- it is not a general purpose tool for writing completely flexible netCDF files.

dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding)
scale_factor = pop_to(encoding, attrs,
'scale_factor', name=name)
add_offset = pop_to(encoding, attrs,
'add_offset', name=name)
dtype = _choose_float_dtype(data.dtype, scale_factor,
add_offset, mode='encoding')
data = data.astype(dtype=dtype, copy=True)
if 'add_offset' in encoding:
data -= pop_to(encoding, attrs, 'add_offset', name=name)
if 'scale_factor' in encoding:
data /= pop_to(encoding, attrs, 'scale_factor', name=name)
if add_offset:
data -= add_offset
if scale_factor:
data /= scale_factor

return Variable(dims, data, attrs, encoding)

@@ -234,7 +267,8 @@ def decode(self, variable, name=None):
if 'scale_factor' in attrs or 'add_offset' in attrs:
scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name)
add_offset = pop_to(attrs, encoding, 'add_offset', name=name)
dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs)
dtype = _choose_float_dtype(data.dtype, scale_factor,
add_offset, mode='decoding')
transform = partial(_scale_offset_decoding,
scale_factor=scale_factor,
add_offset=add_offset,
1 change: 0 additions & 1 deletion xarray/conventions.py
@@ -222,7 +222,6 @@ def encode_cf_variable(var, needs_copy=True, name=None):
A variable which has been encoded as described above.
"""
ensure_not_multiindex(var, name=name)

for coder in [times.CFDatetimeCoder(),
times.CFTimedeltaCoder(),
variables.CFScaleOffsetCoder(),
1 change: 0 additions & 1 deletion xarray/core/dtypes.py
@@ -30,7 +30,6 @@ def __eq__(self, other):
INF = AlwaysGreaterThan()
NINF = AlwaysLessThan()


# Pairs of types that, if both found, should be promoted to object dtype
# instead of following NumPy's own type-promotion rules. These type promotion
# rules match pandas instead. For reference, see the NumPy type hierarchy:
32 changes: 28 additions & 4 deletions xarray/tests/test_backends.py
@@ -685,7 +685,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn):
with self.roundtrip(decoded) as actual:
for k in decoded.variables:
assert (decoded.variables[k].dtype
== actual.variables[k].dtype)
== actual.variables[k].dtype or
(decoded.variables[k].dtype == np.float32 and
shoyer (Member) commented:
It's not clear to me why this test needs to change. Can you give an example of what behavior changed?

DevDaoud (Author) replied (Mar 8, 2019):
@shoyer
Here is my understanding of what xarray does:
In all cases encoding returns either np.float32 or np.float64.
Encoding only cares about the existence of add_offset when the variable is an integer.
If the variable is a float then the encoded dtype is np.float32.
If the variable is an integer with add_offset then the encoded dtype is np.float64.
If the variable is an integer with no add_offset then the encoded dtype is np.float32.
In all other cases the encoded dtype is np.float64.

Here is my understanding of what the cf-conventions imply:
Decoding is the equivalent of the unpacking mentioned in the cf-conventions.
In all cases decoding takes the encoded dtype, which is either np.float32 or np.float64.
The decoded type is then the largest of scale_factor, add_offset and the encoded type, not of the original variable.

That being said, in that test there are cases where
the original type, scale_factor and add_offset are in the following configuration:
np.float32, np.int64, np.float32.
Then the encoded type is np.float32, as the variable is np.float32.
When decoding, the largest (dtypes.result_type()) of the three (encoded, sf, ao) is np.float64.

That's why "decoded" (original) in the test is np.float32 and actual (roundtripped) is np.float64.

In my next commit I took your remark into consideration and wrote out every possible case and what to expect as an encoded or decoded dtype.

actual.variables[k].dtype == np.float64))
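A compact illustration of the configuration described above (variable np.float32, scale_factor np.int64, add_offset np.float32), using np.result_type as a stand-in for dtypes.result_type(); this is why the assertion also accepts a float32 variable decoding to float64:

import numpy as np

packed_dtype = np.dtype(np.float32)  # encoded dtype: the variable is a float
scale_factor = np.int64(10)
add_offset = np.float32(5)

decoded_dtype = np.result_type(packed_dtype,
                               np.dtype(type(scale_factor)),
                               np.dtype(type(add_offset)))
print(decoded_dtype)  # float64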
assert_allclose(decoded, actual, decode_bytes=False)

with self.roundtrip(decoded,
Expand All @@ -711,7 +713,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn):
with self.roundtrip(encoded) as actual:
for k in decoded.variables:
assert (decoded.variables[k].dtype ==
actual.variables[k].dtype)
actual.variables[k].dtype or
(decoded.variables[k].dtype == np.float32 and
actual.variables[k].dtype == np.float64))
assert_allclose(decoded, actual, decode_bytes=False)

def test_coordinates_encoding(self):
@@ -1157,14 +1161,15 @@ def test_mask_and_scale(self):
v = nc.variables['x']
v.set_auto_maskandscale(False)
v.add_offset = 10
v.scale_factor = 0.1
v.scale_factor = np.float32(0.1)
v[:] = np.array([-1, -1, 0, 1, 2])

# first make sure netCDF4 reads the masked and scaled data
# correctly
with nc4.Dataset(tmp_file, mode='r') as nc:
expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
mask=[True, True, False, False, False])
mask=[True, True, False, False, False],
dtype=np.float32)
actual = nc.variables['x'][:]
assert_array_equal(expected, actual)

@@ -1173,6 +1178,25 @@ def test_mask_and_scale(self):
expected = create_masked_and_scaled_data()
assert_identical(expected, ds)

def test_mask_and_scale_with_float64_scale_factor(self):
shoyer (Member) commented:
It would be nice to test all of the edge cases just using CFScaleOffsetCoder, e.g.,

  • what happens if add_offset and scale_factor have different types?
  • what if they're integers instead of floats?

See the tests in tests/test_coding.py for examples.

with create_tmp_file() as tmp_file:
with nc4.Dataset(tmp_file, mode='w') as nc:
nc.createDimension('t', 5)
nc.createVariable('x', 'int16', ('t',), fill_value=-1)
v = nc.variables['x']
v.scale_factor = 0.01
v.add_offset = 10
v[:] = np.array([-1123, -1123, 123, 1123, 2123])
# We read the newly created netcdf file
with nc4.Dataset(tmp_file, mode='r') as nc:
# we open the dataset
with open_dataset(tmp_file) as ds:
# Both dataset values should be equal
# And both of float64 array type
dsv = ds['x'].values
ncv = nc.variables['x'][:]
np.testing.assert_array_almost_equal(dsv, ncv, 15)

def test_0dimensional_variable(self):
# This fix verifies our work-around to this netCDF4-python bug:
# https://github.com/Unidata/netcdf4-python/pull/220
39 changes: 39 additions & 0 deletions xarray/tests/test_coding.py
@@ -50,3 +50,42 @@ def test_scaling_converts_to_float32(dtype):
roundtripped = coder.decode(encoded)
assert_identical(original, roundtripped)
assert roundtripped.dtype == np.float32


@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split())
@pytest.mark.parametrize('scale_factor', [10, 0.01,
np.float16(10),
np.float32(10),
np.float64(10),
np.int8(10),
np.int16(10), np.int32(10),
np.int64(10), np.uint8(10),
np.uint16(10), np.uint32(10),
np.uint64(10)])
@pytest.mark.parametrize('add_offset', [10, 0.01,
np.float16(10),
np.float32(10),
np.float64(10),
np.int8(10),
np.int16(10), np.int32(10),
np.int64(10), np.uint8(10),
np.uint16(10), np.uint32(10),
np.uint64(10)])
def test_scaling_according_to_cf_convention(dtype, scale_factor, add_offset):
shoyer (Member) commented:
This is a great set of invariant checks! But it would still be nice to see a more specific list of unit tests verifying that the decoded dtype is exactly as expected for a set of known inputs, e.g.,

@pytest.mark.parametrize(
    ('input_dtype', 'scale_factor', 'add_offset', 'expected_dtype'),
    [
        (np.float32, 1, 0, np.float32),
        (np.float32, 1, 10, np.float64),
        ...
    ]
)
def test_cfscaleoffset_decoding_dtype(input_dtype, scale_factor, add_offset, expected_dtype):
    original = xr.Variable(...)
    decoded = variables.CFScaleOffsetCoder().decode(original)
    assert decoded.dtype == expected_dtype

This would give us confidence that it's handling all the edge cases properly, e.g., only giving a bigger dtype when truly warranted.

original = xr.Variable(('x',), np.arange(10, dtype=dtype),
encoding=dict(scale_factor=scale_factor,
add_offset=add_offset))
coder = variables.CFScaleOffsetCoder()
encoded = coder.encode(original)
assert encoded.dtype.itemsize >= np.dtype(dtype).itemsize
assert encoded.dtype.itemsize >= 4 and np.issubdtype(encoded, np.floating)

roundtripped = coder.decode(encoded)

# We make sure that roundtripped is at least as large as the original
assert roundtripped.dtype.itemsize >= original.dtype.itemsize
assert (roundtripped.dtype is np.dtype(np.float64)
shoyer (Member) commented:
nit: it's safer to use == instead of is for comparing dtype objects

or roundtripped.dtype is np.dtype(np.float32))

np.testing.assert_array_almost_equal(roundtripped, original)