Set number_of_decimals on normalized variables; adjust Feature Statistics.

* Orange/preprocess/normalize.py: normalize() no longer resets
  number_of_decimals to None; normalize_by_sd and normalize_by_span pass a
  number_of_decimals shifted by log10 of the standard deviation / span to
  var.copy() (for the span only when it is not NaN).
* Orange/tests/test_normalize.py: test_number_of_decimals follows the new
  formatting.
* Orange/widgets/data/owfeaturestatistics.py: add coefficient_of_variation,
  which yields inf where the mean is (close to) zero, and use it as the
  dispersion statistic for continuous columns; render zero values of
  continuous attributes with a fixed number of decimals.

diff --git a/Orange/preprocess/normalize.py b/Orange/preprocess/normalize.py
index 753e9c340fa..83868ee5d11 100644
--- a/Orange/preprocess/normalize.py
+++ b/Orange/preprocess/normalize.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from Orange.data import Domain
+from Orange.data import Domain, ContinuousVariable
 from Orange.statistics import distribution
 from Orange.util import Reprable
 from .preprocess import Normalize
@@ -42,10 +42,9 @@ def normalize(self, dist, var):
             var = self.normalize_by_sd(dist, var)
         elif self.norm_type == Normalize.NormalizeBySpan:
             var = self.normalize_by_span(dist, var)
-        var.number_of_decimals = None
         return var
 
-    def normalize_by_sd(self, dist, var):
+    def normalize_by_sd(self, dist, var: ContinuousVariable) -> ContinuousVariable:
         avg, sd = (dist.mean(), dist.standard_deviation()) if dist.size else (0, 1)
         if sd == 0:
             sd = 1
@@ -53,9 +52,17 @@ def normalize_by_sd(self, dist, var):
             compute_val = Norm(var, avg, 1 / sd)
         else:
             compute_val = Norm(var, 0, 1 / sd)
-        return var.copy(compute_value=compute_val)
 
-    def normalize_by_span(self, dist, var):
+        # When dealing with integers, and multiplying by something smaller than
+        # 1, the number of decimals should be decreased, but this integer will
+        # likely turn into a float, which should have some default number of
+        # decimals
+        num_decimals = var.number_of_decimals + int(np.round(np.log10(sd)))
+        num_decimals = max(num_decimals, 1)  # always keep at least one decimal
+
+        return var.copy(compute_value=compute_val, number_of_decimals=num_decimals)
+
+    def normalize_by_span(self, dist, var: ContinuousVariable) -> ContinuousVariable:
         dma, dmi = (dist.max(), dist.min()) if dist.shape[1] else (np.nan, np.nan)
         diff = dma - dmi
         if diff < 1e-15:
@@ -64,4 +71,9 @@ def normalize_by_span(self, dist, var):
             compute_val = Norm(var, dmi, 1 / diff)
         else:
             compute_val = Norm(var, (dma + dmi) / 2, 2 / diff)
-        return var.copy(compute_value=compute_val)
+        if not np.isnan(diff):
+            num_decimals = var.number_of_decimals + int(np.ceil(np.log10(diff)))
+            num_decimals = max(num_decimals, 0)  # number of decimals cannot be negative
+            return var.copy(compute_value=compute_val, number_of_decimals=num_decimals)
+        else:
+            return var.copy(compute_value=compute_val)
diff --git a/Orange/tests/test_normalize.py b/Orange/tests/test_normalize.py
index 84c38d6dc70..d58e9daae9f 100644
--- a/Orange/tests/test_normalize.py
+++ b/Orange/tests/test_normalize.py
@@ -159,14 +159,12 @@ def test_number_of_decimals(self):
         data = Table.from_list(Domain((foo,)), [[1], [2], [3]])
 
         normalized = Normalize()(data)
-        norm_foo = normalized.domain.attributes[0]
+        norm_foo: ContinuousVariable = normalized.domain.attributes[0]
 
-        self.assertEqual(norm_foo.number_of_decimals, 3)
-        self.assertEqual(norm_foo.format_str, "%g")
-        self.assertEqual(norm_foo.adjust_decimals, 2)
+        self.assertGreater(norm_foo.number_of_decimals, 0)
 
         for val1, val2 in zip(normalized[:, "Foo"],
-                              ["-1.22474", "0", "1.22474"]):
+                              ["-1.225", "0.0", "1.225"]):
             self.assertEqual(str(val1[0]), val2)
 
 
diff --git a/Orange/widgets/data/owfeaturestatistics.py b/Orange/widgets/data/owfeaturestatistics.py
index 5005a0e9f91..d0b14683384 100644
--- a/Orange/widgets/data/owfeaturestatistics.py
+++ b/Orange/widgets/data/owfeaturestatistics.py
@@ -39,6 +39,14 @@ def _categorical_entropy(x):
     return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)
 
 
+def coefficient_of_variation(x: np.ndarray) -> np.ndarray:
+    mu = ut.nanmean(x, axis=0)
+    mask = ~np.isclose(mu, 0, atol=1e-12)
+    result = np.full_like(mu, fill_value=np.inf)
+    result[mask] = np.sqrt(ut.nanvar(x, axis=0)[mask]) / mu[mask]
+    return result
+
+
 def format_time_diff(start, end, round_up_after=2):
     """Return an approximate human readable time difference between two dates.
 
@@ -228,7 +236,7 @@ def __compute_statistics(self):
         self._dispersion = self.__compute_stat(
             matrices,
             discrete_f=_categorical_entropy,
-            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
+            continuous_f=coefficient_of_variation,
         )
         self._missing = self.__compute_stat(
             matrices,
@@ -485,8 +493,24 @@ def decoration():
 
         def display():
             # pylint: disable=too-many-branches
+            def format_zeros(str_val):
+                """Zeros should be handled separately as they cannot be negative."""
+                if float(str_val) == 0:
+                    num_decimals = min(self.variables[row].number_of_decimals, 2)
+                    str_val = f"{0:.{num_decimals}f}"
+                return str_val
+
             def render_value(value):
-                return "" if np.isnan(value) else attribute.str_val(value)
+                if np.isnan(value):
+                    return ""
+
+                str_val = attribute.str_val(value)
+                # Time variables are continuous too, but render as date/time
+                # strings that float() cannot parse, so leave them untouched.
+                if attribute.is_continuous and not attribute.is_time:
+                    str_val = format_zeros(str_val)
+
+                return str_val
 
             if column == self.Columns.NAME:
                 return attribute.name