Skip to content

Commit e572020

Browse files
authored
Merge pull request #4466 from janezd/owcontinuize-simplify-normalize
[ENH] OWContinuize: Provide the same options as in Preprocess/Normalize
2 parents 6e010ef + a89beee commit e572020

File tree

2 files changed

+185
-159
lines changed

2 files changed

+185
-159
lines changed

Orange/widgets/data/owcontinuize.py

Lines changed: 98 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
from functools import reduce
2+
from types import SimpleNamespace
23

34
from AnyQt.QtCore import Qt
45

56
import Orange.data
67
from Orange.util import Reprable
78
from Orange.statistics import distribution
8-
from Orange.preprocess import Continuize, Normalize
9-
from Orange.preprocess.transformation import \
10-
Identity, Indicator, Indicator1, Normalizer
9+
from Orange.preprocess import Continuize
10+
from Orange.preprocess.transformation import Identity, Indicator, Normalizer
1111
from Orange.data.table import Table
1212
from Orange.widgets import gui, widget
1313
from Orange.widgets.settings import Setting
@@ -34,16 +34,13 @@ class Outputs:
3434
buttons_area_orientation = Qt.Vertical
3535
resizing_enabled = False
3636

37-
# continuous treats
38-
Leave, NormalizeBySpan, NormalizeBySD = range(3)
37+
Normalize = SimpleNamespace(Leave=0, Standardize=1, Center=2, Scale=3,
38+
Normalize11=4, Normalize01=5)
3939

40+
settings_version = 2
4041
multinomial_treatment = Setting(0)
41-
zero_based = Setting(1)
42-
continuous_treatment = Setting(Leave)
42+
continuous_treatment = Setting(Normalize.Leave)
4343
class_treatment = Setting(0)
44-
45-
transform_class = Setting(False)
46-
4744
autosend = Setting(True)
4845

4946
multinomial_treats = (
@@ -56,9 +53,13 @@ class Outputs:
5653
("Divide by number of values", Continuize.AsNormalizedOrdinal))
5754

5855
continuous_treats = (
59-
("Leave them as they are", Continuize.Leave),
60-
("Normalize by span", Normalize.NormalizeBySpan),
61-
("Normalize by standard deviation", Normalize.NormalizeBySD))
56+
("Leave them as they are", True),
57+
("Standardize to μ=0, σ²=1", False),
58+
("Center to μ=0", False),
59+
("Scale to σ²=1", True),
60+
("Normalize to interval [-1, 1]", False),
61+
("Normalize to interval [0, 1]", False)
62+
)
6263

6364
class_treats = (
6465
("Leave it as it is", Continuize.Leave),
@@ -67,8 +68,6 @@ class Outputs:
6768
("One class per value", Continuize.Indicators),
6869
)
6970

70-
value_ranges = ["From -1 to 1", "From 0 to 1"]
71-
7271
def __init__(self):
7372
super().__init__()
7473

@@ -84,19 +83,12 @@ def __init__(self):
8483
btnLabels=[x[0] for x in self.continuous_treats],
8584
callback=self.settings_changed)
8685

87-
box = gui.vBox(self.controlArea, "Categorical Outcomes")
86+
box = gui.vBox(self.controlArea, "Categorical Outcome(s)")
8887
gui.radioButtonsInBox(
8988
box, self, "class_treatment",
9089
btnLabels=[t[0] for t in self.class_treats],
9190
callback=self.settings_changed)
9291

93-
zbbox = gui.vBox(self.controlArea, "Value Range")
94-
95-
gui.radioButtonsInBox(
96-
zbbox, self, "zero_based",
97-
btnLabels=self.value_ranges,
98-
callback=self.settings_changed)
99-
10092
gui.auto_apply(self.buttonsArea, self, "autosend", box=False)
10193

10294
self.data = None
@@ -120,31 +112,27 @@ def setData(self, data):
120112
self.unconditional_commit()
121113

122114
def enable_normalization(self):
    """Enable only the normalization options usable with the current data.

    For sparse data, each radio button is enabled according to the
    ``supports_sparse`` flag stored as the second item of the matching
    ``continuous_treats`` entry. For dense data (or no data) every
    button is enabled.

    If the currently selected treatment is not flagged as supporting
    sparse data, it is replaced: ``Standardize`` becomes ``Scale`` and
    anything else becomes ``Leave``. A selection that already supports
    sparse data (``Leave`` or ``Scale``) is kept as it is.
    """
    buttons = self.controls.continuous_treatment.buttons
    if self.data is not None and self.data.is_sparse():
        sparse_flags = [supports_sparse
                        for _, supports_sparse in self.continuous_treats]
        # Only override the user's choice when it cannot be applied;
        # resetting unconditionally would silently turn Scale into Leave.
        if not sparse_flags[self.continuous_treatment]:
            if self.continuous_treatment == self.Normalize.Standardize:
                self.continuous_treatment = self.Normalize.Scale
            else:
                self.continuous_treatment = self.Normalize.Leave
        for button, supports_sparse in zip(buttons, sparse_flags):
            button.setEnabled(supports_sparse)
    else:
        for button in buttons:
            button.setEnabled(True)
130127

131128
def constructContinuizer(self):
    """Build a ``DomainContinuizer`` from the widget's current settings.

    The categorical and class treatments are looked up as the second
    item of the selected ``multinomial_treats`` / ``class_treats``
    entry; the numeric treatment is passed on as the stored index.
    """
    categorical = self.multinomial_treats[self.multinomial_treatment][1]
    numeric = self.continuous_treatment
    target = self.class_treats[self.class_treatment][1]
    return DomainContinuizer(
        multinomial_treatment=categorical,
        continuous_treatment=numeric,
        class_treatment=target,
    )
139135

140-
# def sendPreprocessor(self):
141-
# continuizer = self.constructContinuizer()
142-
# self.send("Preprocessor", PreprocessedLearner(
143-
# lambda data, weightId=0, tc=(self.targetValue if self.classTreatment else -1):
144-
# Table(continuizer(data, weightId, tc)
145-
# if data.domain.has_discrete_class
146-
# else continuizer(data, weightId), data)))
147-
148136
def commit(self):
149137
continuizer = self.constructContinuizer()
150138
if self.data:
@@ -155,16 +143,28 @@ def commit(self):
155143
else:
156144
self.Outputs.data.send(self.data) # None or empty data
157145

158-
159146
def send_report(self):
    """Report the labels of the currently selected treatments.

    Each label is the first item of the selected entry in
    ``multinomial_treats``, ``continuous_treats`` and ``class_treats``.
    """
    categorical_label = self.multinomial_treats[self.multinomial_treatment][0]
    numeric_label = self.continuous_treats[self.continuous_treatment][0]
    class_label = self.class_treats[self.class_treatment][0]
    rows = [
        ("Categorical features", categorical_label),
        ("Numeric features", numeric_label),
        ("Class", class_label),
    ]
    self.report_items("Settings", rows)
154+
155+
@classmethod
def migrate_settings(cls, settings, version):
    """Convert settings stored by versions older than 2, in place.

    The old ``continuous_treatment`` and ``zero_based`` entries are
    removed from ``settings``. An old treatment of 1 becomes
    ``Normalize01`` or ``Normalize11`` depending on ``zero_based``;
    an old treatment of 2 becomes ``Standardize``. For any other old
    value no ``continuous_treatment`` entry is written back.
    """
    if not version < 2:
        return
    options = cls.Normalize
    old_treatment = settings.pop("continuous_treatment", 0)
    old_zero_based = settings.pop("zero_based", True)
    if old_treatment == 2:
        settings["continuous_treatment"] = options.Standardize
    elif old_treatment == 1:
        settings["continuous_treatment"] = (
            options.Normalize01 if old_zero_based else options.Normalize11)
168168

169169

170170
class WeightedIndicator(Indicator):
@@ -179,56 +179,33 @@ def transform(self, c):
179179
return t
180180

181181

182-
class WeightedIndicator1(Indicator1):
183-
def __init__(self, variable, value, weight=1.0):
184-
super().__init__(variable, value)
185-
self.weight = weight
186-
187-
def transform(self, c):
188-
t = super().transform(c) * self.weight
189-
if self.weight != 1.0:
190-
t *= self.weight
191-
return t
192-
193-
194-
def make_indicator_var(source, value_ind, weight=None, zero_based=True):
195-
if zero_based and weight is None:
182+
def make_indicator_var(source, value_ind, weight=None):
    """Create a continuous indicator variable for one value of `source`.

    The new variable is named ``"<source name>=<value>"``. Its
    ``compute_value`` is an ``Indicator`` when `weight` is None and a
    ``WeightedIndicator`` carrying `weight` otherwise.
    """
    if weight is not None:
        transform = WeightedIndicator(source, value=value_ind, weight=weight)
    else:
        transform = Indicator(source, value=value_ind)
    name = "{}={}".format(source.name, source.values[value_ind])
    return Orange.data.ContinuousVariable(name, compute_value=transform)
207191

208192

209-
def dummy_coding(var, base_value=0, zero_based=True):
193+
def dummy_coding(var, base_value=0):
    """Return indicator variables for all values of `var` except one.

    The value whose index equals `base_value` is skipped.
    """
    indicators = []
    for index in range(len(var.values)):
        if index != base_value:
            indicators.append(make_indicator_var(var, index))
    return indicators
213197

214198

215-
def one_hot_coding(var, zero_based=True):
199+
def one_hot_coding(var):
    """Return one indicator variable for every value of `var`."""
    indicators = []
    for index in range(len(var.values)):
        indicators.append(make_indicator_var(var, index))
    return indicators
219202

220203

221-
def continuize_domain(data_or_domain,
204+
def continuize_domain(data,
222205
multinomial_treatment=Continuize.Indicators,
223206
continuous_treatment=Continuize.Leave,
224-
class_treatment=Continuize.Leave,
225-
zero_based=True):
226-
227-
if isinstance(data_or_domain, Orange.data.Domain):
228-
data, domain = None, data_or_domain
229-
else:
230-
data, domain = data_or_domain, data_or_domain.domain
231-
207+
class_treatment=Continuize.Leave):
208+
domain = data.domain
232209
def needs_dist(var, mtreat, ctreat):
233210
"Does the `var` need a distribution given specified flags"
234211
if var.is_discrete:
@@ -258,14 +235,11 @@ def needs_dist(var, mtreat, ctreat):
258235
dist_iter = iter(dist)
259236

260237
newattrs = [continuize_var(var, next(dist_iter) if needs_dist else None,
261-
multinomial_treatment, continuous_treatment,
262-
zero_based)
238+
multinomial_treatment, continuous_treatment)
263239
for var, needs_dist in zip(domain.attributes, attr_needs_dist)]
264-
265240
newclass = [continuize_var(var,
266241
next(dist_iter) if needs_dist else None,
267-
class_treatment, Continuize.Remove,
268-
zero_based)
242+
class_treatment, Continuize.Remove)
269243
for var, needs_dist in zip(domain.class_vars, cls_needs_dist)]
270244

271245
newattrs = reduce(list.__iadd__, newattrs, [])
@@ -276,16 +250,16 @@ def needs_dist(var, mtreat, ctreat):
276250
def continuize_var(var,
277251
data_or_dist=None,
278252
multinomial_treatment=Continuize.Indicators,
279-
continuous_treatment=Continuize.Leave,
280-
zero_based=True):
281-
253+
continuous_treatment=Continuize.Leave):
282254
def continuize_continuous():
283-
if continuous_treatment == Normalize.NormalizeBySpan:
284-
return [normalize_by_span(var, data_or_dist, zero_based)]
285-
elif continuous_treatment == Normalize.NormalizeBySD:
286-
return [normalize_by_sd(var, data_or_dist)]
287-
else:
255+
dist = _ensure_dist(var, data_or_dist)
256+
treatments = [lambda var, _: var,
257+
normalize_by_sd, center_to_mean, divide_by_sd,
258+
normalize_to_11, normalize_to_01]
259+
if dist.shape[1] == 0:
288260
return [var]
261+
new_var = treatments[continuous_treatment](var, dist)
262+
return [new_var]
289263

290264
def continuize_discrete():
291265
if len(var.values) > 2 and \
@@ -299,16 +273,16 @@ def continuize_discrete():
299273
elif multinomial_treatment == Continuize.AsOrdinal:
300274
return [ordinal_to_continuous(var)]
301275
elif multinomial_treatment == Continuize.AsNormalizedOrdinal:
302-
return [ordinal_to_norm_continuous(var, zero_based)]
276+
return [ordinal_to_norm_continuous(var)]
303277
elif multinomial_treatment == Continuize.Indicators:
304-
return one_hot_coding(var, zero_based)
278+
return one_hot_coding(var)
305279
elif multinomial_treatment in (
306280
Continuize.FirstAsBase, Continuize.RemoveMultinomial):
307-
return dummy_coding(var, zero_based=zero_based)
281+
return dummy_coding(var)
308282
elif multinomial_treatment == Continuize.FrequentAsBase:
309283
dist = _ensure_dist(var, data_or_dist)
310284
modus = dist.modus()
311-
return dummy_coding(var, base_value=modus, zero_based=zero_based)
285+
return dummy_coding(var, base_value=modus)
312286
elif multinomial_treatment == Continuize.Leave:
313287
return [var]
314288
raise ValueError("Invalid value of `multinomial_treatment`")
@@ -345,68 +319,67 @@ def ordinal_to_continuous(var):
345319
compute_value=Identity(var))
346320

347321

348-
def ordinal_to_norm_continuous(var, zero_based=True):
322+
def ordinal_to_norm_continuous(var):
    """Return `var` rescaled by the number of its values.

    The value index is passed to ``normalized_var`` with offset 0 and
    factor ``1 / (n_values - 1)``.
    NOTE(review): presumably ``normalized_var`` computes
    ``(x - offset) * factor``, which would map the indices onto [0, 1];
    confirm against its definition.

    A variable with fewer than two values has no range to divide by, so
    a factor of 1 is used instead of raising ``ZeroDivisionError``.
    """
    n_values = len(var.values)
    factor = 1 / (n_values - 1) if n_values > 1 else 1
    return normalized_var(var, 0, factor)
354325

355326

356-
def normalize_by_span(var, data_or_dist, zero_based=True):
357-
dist = _ensure_dist(var, data_or_dist)
358-
if dist.shape[1] > 0:
359-
v_max, v_min = dist.max(), dist.min()
360-
else:
361-
v_max, v_min = 0, 0
327+
def normalize_by_sd(var, dist):
    """Return `var` normalized with the mean and standard deviation of `dist`.

    The mean is passed as the offset and ``1 / sd`` as the factor to
    ``normalized_var``. A standard deviation that is not greater than
    1e-10 is replaced by 1, so the factor stays finite.
    """
    center = dist.mean()
    spread = dist.standard_deviation()
    if not spread > 1e-10:
        spread = 1
    return normalized_var(var, center, 1 / spread)
331+
332+
333+
def center_to_mean(var, dist):
    """Return `var` normalized with the mean of `dist` as offset and factor 1."""
    mean = dist.mean()
    return normalized_var(var, mean, 1)
335+
336+
337+
def divide_by_sd(var, dist):
    """Return `var` normalized with offset 0 and factor ``1 / sd`` of `dist`.

    A standard deviation that is not greater than 1e-10 is replaced
    by 1, so the factor stays finite.
    """
    spread = dist.standard_deviation()
    if not spread > 1e-10:
        spread = 1
    return normalized_var(var, 0, 1 / spread)
341+
342+
343+
def normalize_to_11(var, dist):
    """Normalize `var` by the span of `dist`, with ``zero_based`` off."""
    return normalize_by_span(var, dist, zero_based=False)
345+
346+
347+
def normalize_to_01(var, dist):
    """Normalize `var` by the span of `dist`, with ``zero_based`` on."""
    return normalize_by_span(var, dist, zero_based=True)
349+
350+
351+
def normalize_by_span(var, dist, zero_based=True):
    """Return `var` normalized by the span (max - min) of `dist`.

    With `zero_based` the minimum is the offset and ``1 / span`` the
    factor; otherwise the midpoint of the range is the offset and
    ``2 / span`` the factor. A span below 1e-15 is replaced by 1.
    """
    high = dist.max()
    low = dist.min()
    width = high - low
    if width < 1e-15:
        width = 1
    if not zero_based:
        return normalized_var(var, (low + high) / 2, 2 / width)
    return normalized_var(var, low, 1 / width)
370360

371361

372-
def normalize_by_sd(var, data_or_dist):
373-
dist = _ensure_dist(var, data_or_dist)
374-
if dist.shape[1] > 0:
375-
mean, sd = dist.mean(), dist.standard_deviation()
376-
else:
377-
mean, sd = 0, 1
378-
sd = sd if sd > 1e-10 else 1
379-
return normalized_var(var, mean, 1 / sd)
380-
381-
382362
class DomainContinuizer(Reprable):
    """Callable that builds a continuized domain for a data table.

    The three treatments are stored on the instance and forwarded
    unchanged to ``continuize_domain`` when the instance is called.
    """

    def __init__(self,
                 multinomial_treatment=Continuize.Indicators,
                 continuous_treatment=Continuize.Leave,
                 class_treatment=Continuize.Leave):
        # NOTE(review): continuize_var uses continuous_treatment as a list
        # index, so callers presumably have to pass an index 0-5 -- confirm
        # that the Continuize.Leave default is usable there.
        self.multinomial_treatment = multinomial_treatment
        self.continuous_treatment = continuous_treatment
        self.class_treatment = class_treatment

    def __call__(self, data):
        """Return the continuized domain for `data`.

        Raises ValueError when the multinomial treatment is
        ``Continuize.ReportError`` and the domain contains a discrete
        variable with more than two values.
        """
        domain = data.domain
        if self.multinomial_treatment == Continuize.ReportError:
            for var in domain:
                if var.is_discrete and len(var.values) > 2:
                    raise ValueError("Domain has multinomial attributes")

        return continuize_domain(
            data,
            self.multinomial_treatment,
            self.continuous_treatment,
            self.class_treatment)
411384

412385

0 commit comments

Comments
 (0)