Merge pull request #4779 from pavlin-policar/feature-statistics-1

lanzagar · web-flow · commit 0cc522f8e300 · 2020-06-19T14:46:59.000+02:00
[FIX] normalize: Adjust number_of_decimals after scaling
diff --git a/Orange/preprocess/normalize.py b/Orange/preprocess/normalize.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from Orange.data import Domain
+from Orange.data import Domain, ContinuousVariable
 from Orange.statistics import distribution
 from Orange.util import Reprable
 from .preprocess import Normalize
@@ -42,20 +42,27 @@ def normalize(self, dist, var):
             var = self.normalize_by_sd(dist, var)
         elif self.norm_type == Normalize.NormalizeBySpan:
             var = self.normalize_by_span(dist, var)
-        var.number_of_decimals = None
         return var
 
-    def normalize_by_sd(self, dist, var):
+    def normalize_by_sd(self, dist, var: ContinuousVariable) -> ContinuousVariable:
         avg, sd = (dist.mean(), dist.standard_deviation()) if dist.size else (0, 1)
         if sd == 0:
             sd = 1
         if self.center:
             compute_val = Norm(var, avg, 1 / sd)
         else:
             compute_val = Norm(var, 0, 1 / sd)
-        return var.copy(compute_value=compute_val)
 
-    def normalize_by_span(self, dist, var):
+        # Scaling by 1 / sd shifts the decimal point by about log10(sd) places,
+        # so the number of decimals is adjusted by that amount. Integer-valued
+        # variables will likely turn into floats after scaling, so at least
+        # one decimal is always kept.
+        num_decimals = var.number_of_decimals + int(np.round(np.log10(sd)))
+        num_decimals = max(num_decimals, 1)  # always show at least one decimal
+
+        return var.copy(compute_value=compute_val, number_of_decimals=num_decimals)
+
+    def normalize_by_span(self, dist, var: ContinuousVariable) -> ContinuousVariable:
         dma, dmi = (dist.max(), dist.min()) if dist.shape[1] else (np.nan, np.nan)
         diff = dma - dmi
         if diff < 1e-15:
@@ -64,4 +71,9 @@ def normalize_by_span(self, dist, var):
             compute_val = Norm(var, dmi, 1 / diff)
         else:
             compute_val = Norm(var, (dma + dmi) / 2, 2 / diff)
-        return var.copy(compute_value=compute_val)
+        if not np.isnan(diff):
+            num_decimals = var.number_of_decimals + int(np.ceil(np.log10(diff)))
+            num_decimals = max(num_decimals, 0)  # num decimals can't be negative
+            return var.copy(compute_value=compute_val, number_of_decimals=num_decimals)
+        else:
+            return var.copy(compute_value=compute_val)
diff --git a/Orange/tests/test_normalize.py b/Orange/tests/test_normalize.py
@@ -159,14 +159,12 @@ def test_number_of_decimals(self):
         data = Table.from_list(Domain((foo,)), [[1], [2], [3]])
 
         normalized = Normalize()(data)
-        norm_foo = normalized.domain.attributes[0]
+        norm_foo: ContinuousVariable = normalized.domain.attributes[0]
 
-        self.assertEqual(norm_foo.number_of_decimals, 3)
-        self.assertEqual(norm_foo.format_str, "%g")
-        self.assertEqual(norm_foo.adjust_decimals, 2)
+        self.assertGreater(norm_foo.number_of_decimals, 0)
 
         for val1, val2 in zip(normalized[:, "Foo"],
-                              ["-1.22474", "0", "1.22474"]):
+                              ["-1.225", "0.0", "1.225"]):
             self.assertEqual(str(val1[0]), val2)
 
 
diff --git a/Orange/widgets/data/owfeaturestatistics.py b/Orange/widgets/data/owfeaturestatistics.py
@@ -41,6 +41,14 @@ def _categorical_entropy(x):
     return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)
 
 
+def coefficient_of_variation(x: np.ndarray) -> np.ndarray:
+    mu = ut.nanmean(x, axis=0)
+    mask = ~np.isclose(mu, 0, atol=1e-12)
+    result = np.full_like(mu, fill_value=np.inf)
+    result[mask] = np.sqrt(ut.nanvar(x, axis=0)[mask]) / mu[mask]
+    return result
+
+
 def format_time_diff(start, end, round_up_after=2):
     """Return an approximate human readable time difference between two dates.
 
@@ -230,7 +238,7 @@ def __compute_statistics(self):
         self._dispersion = self.__compute_stat(
             matrices,
             discrete_f=_categorical_entropy,
-            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
+            continuous_f=coefficient_of_variation,
         )
         self._missing = self.__compute_stat(
             matrices,
@@ -487,8 +495,22 @@ def decoration():
 
         def display():
             # pylint: disable=too-many-branches
+            def format_zeros(str_val):
+                """Zeros should be handled separately as they cannot be negative."""
+                if float(str_val) == 0:
+                    num_decimals = min(self.variables[row].number_of_decimals, 2)
+                    str_val = f"{0:.{num_decimals}f}"
+                return str_val
+
             def render_value(value):
-                return "" if np.isnan(value) else attribute.str_val(value)
+                if np.isnan(value):
+                    return ""
+
+                str_val = attribute.str_val(value)
+                if attribute.is_continuous:
+                    str_val = format_zeros(str_val)
+
+                return str_val
 
             if column == self.Columns.NAME:
                 return attribute.name