Skip to content

Commit 0cc522f

Browse files
authored
Merge pull request #4779 from pavlin-policar/feature-statistics-1
[FIX] normalize: Adjust number_of_decimals after scaling
2 parents 2ae1344 + f7ff577 commit 0cc522f

File tree

3 files changed

+45
-13
lines changed

3 files changed

+45
-13
lines changed

Orange/preprocess/normalize.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import numpy as np
22

3-
from Orange.data import Domain
3+
from Orange.data import Domain, ContinuousVariable
44
from Orange.statistics import distribution
55
from Orange.util import Reprable
66
from .preprocess import Normalize
@@ -42,20 +42,27 @@ def normalize(self, dist, var):
4242
var = self.normalize_by_sd(dist, var)
4343
elif self.norm_type == Normalize.NormalizeBySpan:
4444
var = self.normalize_by_span(dist, var)
45-
var.number_of_decimals = None
4645
return var
4746

48-
def normalize_by_sd(self, dist, var):
47+
def normalize_by_sd(self, dist, var: ContinuousVariable) -> ContinuousVariable:
    """Return a copy of ``var`` scaled by the standard deviation of ``dist``.

    The copy's ``compute_value`` is a ``Norm`` that multiplies by ``1 / sd``
    and uses the mean of ``dist`` as offset when ``self.center`` is set
    (offset ``0`` otherwise).  An empty distribution is treated as having
    mean 0 and standard deviation 1, and a zero standard deviation is
    replaced by 1 so the scale factor stays finite.

    The copy's ``number_of_decimals`` is the original value shifted by the
    rounded order of magnitude of ``sd`` and clamped to at least 1.
    """
    if dist.size:
        avg, sd = dist.mean(), dist.standard_deviation()
    else:
        avg, sd = 0, 1
    if sd == 0:
        sd = 1

    offset = avg if self.center else 0
    compute_val = Norm(var, offset, 1 / sd)

    # Dividing by sd moves the decimal point by roughly log10(sd) places, so
    # the displayed precision is shifted by the same amount.  The result is
    # kept at one decimal or more because scaled values are generally
    # non-integral even when the input was integer-valued.
    num_decimals = max(
        var.number_of_decimals + int(np.round(np.log10(sd))),
        1,
    )
    return var.copy(compute_value=compute_val, number_of_decimals=num_decimals)
64+
65+
def normalize_by_span(self, dist, var: ContinuousVariable) -> ContinuousVariable:
5966
dma, dmi = (dist.max(), dist.min()) if dist.shape[1] else (np.nan, np.nan)
6067
diff = dma - dmi
6168
if diff < 1e-15:
@@ -64,4 +71,9 @@ def normalize_by_span(self, dist, var):
6471
compute_val = Norm(var, dmi, 1 / diff)
6572
else:
6673
compute_val = Norm(var, (dma + dmi) / 2, 2 / diff)
67-
return var.copy(compute_value=compute_val)
74+
if not np.isnan(diff):
75+
num_decimals = var.number_of_decimals + int(np.ceil(np.log10(diff)))
76+
num_decimals = max(num_decimals, 0) # num decimals can't be negative
77+
return var.copy(compute_value=compute_val, number_of_decimals=num_decimals)
78+
else:
79+
return var.copy(compute_value=compute_val)

Orange/tests/test_normalize.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,14 +159,12 @@ def test_number_of_decimals(self):
159159
data = Table.from_list(Domain((foo,)), [[1], [2], [3]])
160160

161161
normalized = Normalize()(data)
162-
norm_foo = normalized.domain.attributes[0]
162+
norm_foo: ContinuousVariable = normalized.domain.attributes[0]
163163

164-
self.assertEqual(norm_foo.number_of_decimals, 3)
165-
self.assertEqual(norm_foo.format_str, "%g")
166-
self.assertEqual(norm_foo.adjust_decimals, 2)
164+
self.assertGreater(norm_foo.number_of_decimals, 0)
167165

168166
for val1, val2 in zip(normalized[:, "Foo"],
169-
["-1.22474", "0", "1.22474"]):
167+
["-1.225", "0.0", "1.225"]):
170168
self.assertEqual(str(val1[0]), val2)
171169

172170

Orange/widgets/data/owfeaturestatistics.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ def _categorical_entropy(x):
4141
return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)
4242

4343

44+
def coefficient_of_variation(x: np.ndarray) -> np.ndarray:
    """Return the per-column ratio of standard deviation to mean of ``x``.

    Statistics are taken along axis 0 with the NaN-aware helpers from ``ut``.
    Columns whose mean is within ``1e-12`` of zero are reported as ``inf``
    instead of being divided by (nearly) zero.
    """
    means = ut.nanmean(x, axis=0)
    has_nonzero_mean = np.logical_not(np.isclose(means, 0, atol=1e-12))

    # Start from "infinite dispersion" and overwrite only the columns where
    # the division is well defined.
    cv = np.full_like(means, fill_value=np.inf)
    std = np.sqrt(ut.nanvar(x, axis=0)[has_nonzero_mean])
    cv[has_nonzero_mean] = std / means[has_nonzero_mean]
    return cv
50+
51+
4452
def format_time_diff(start, end, round_up_after=2):
4553
"""Return an approximate human readable time difference between two dates.
4654
@@ -230,7 +238,7 @@ def __compute_statistics(self):
230238
self._dispersion = self.__compute_stat(
231239
matrices,
232240
discrete_f=_categorical_entropy,
233-
continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
241+
continuous_f=coefficient_of_variation,
234242
)
235243
self._missing = self.__compute_stat(
236244
matrices,
@@ -487,8 +495,22 @@ def decoration():
487495

488496
def display():
489497
# pylint: disable=too-many-branches
498+
def format_zeros(str_val):
    """Return ``str_val`` with a zero value rendered as an unsigned zero.

    Zeros are handled separately because they cannot be negative: when the
    string parses to a number equal to zero it is replaced by ``0`` printed
    with the variable's number of decimals, capped at two.  Any other
    numeric string is returned unchanged.

    Strings that do not parse as a float (for example a formatted date) are
    also returned unchanged instead of raising ``ValueError``.
    """
    try:
        is_zero = float(str_val) == 0
    except ValueError:
        # Not a plain number, so there is no zero to normalise.
        return str_val

    if is_zero:
        num_decimals = min(self.variables[row].number_of_decimals, 2)
        str_val = f"{0:.{num_decimals}f}"
    return str_val
504+
490505
def render_value(value):
    """Return the display string for ``value`` of the current attribute.

    NaN is rendered as an empty string.  Other values are formatted with
    ``attribute.str_val``; for continuous, non-time attributes the result is
    additionally passed through ``format_zeros``.
    """
    if np.isnan(value):
        return ""

    str_val = attribute.str_val(value)
    # Time variables also report ``is_continuous`` in Orange, but their
    # formatted value is a date string rather than a number, so the
    # zero normalisation (which parses the string as a float) must be
    # skipped for them.
    if attribute.is_continuous and not attribute.is_time:
        str_val = format_zeros(str_val)

    return str_val
492514

493515
if column == self.Columns.NAME:
494516
return attribute.name

0 commit comments

Comments
 (0)