Skip to content

Commit 9f49bc2

Browse files
janezdmarkotoplak
authored and committed
Feature as Predictor: New widget
1 parent 32f4d6d commit 9f49bc2

File tree

8 files changed

+1060
-12
lines changed

8 files changed

+1060
-12
lines changed

Orange/modelling/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .randomforest import *
1212
from .svm import *
1313
from .tree import *
14+
from .column import *
1415
try:
1516
from .catgb import *
1617
except ImportError:

Orange/modelling/column.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
from typing import Optional
2+
3+
import numpy as np
4+
5+
from Orange.data import Variable, DiscreteVariable, Domain, Table
6+
from Orange.classification import LogisticRegressionLearner
7+
from Orange.regression import LinearRegressionLearner
8+
from Orange.modelling import Model, Learner
9+
10+
__all__ = ["ColumnLearner", "ColumnModel"]
11+
12+
13+
def _check_column_combinations(
        class_var: Variable,
        column: Variable,
        fit_regression: bool):
    """Validate that `column` may be used to predict `class_var`.

    Raises:
        ValueError: when the target/column type combination is unsupported,
            or when coefficient fitting is requested for a non-numeric column.
    """
    if class_var.is_continuous:
        # Numeric target: only a numeric column can feed a regression.
        if column.is_continuous:
            return
        raise ValueError(
            "Regression can only be used with numeric variables")

    assert isinstance(class_var, DiscreteVariable)  # remove type warnings
    if not column.is_continuous:
        assert isinstance(column, DiscreteVariable)
        # Discrete column: its values must form a subset of the class values.
        if not valid_value_sets(class_var, column):
            raise ValueError(
                "Column contains values that are not in class variable")
        # Logistic-style fitting only makes sense for numeric columns.
        if fit_regression:
            raise ValueError(
                "Intercept and coefficient are only allowed for continuous "
                "variables")
        return

    # Numeric column as class probability: target must be binary.
    if len(class_var.values) != 2:
        raise ValueError(
            "Numeric columns can only be used with binary class variables")
37+
38+
39+
def valid_prob_range(values: np.ndarray):
    """Return True iff all non-NaN entries of `values` lie within [0, 1]."""
    lowest = np.nanmin(values)
    highest = np.nanmax(values)
    return lowest >= 0 and highest <= 1
41+
42+
43+
def valid_value_sets(class_var: DiscreteVariable,
                     column_var: DiscreteVariable):
    """Return True iff every value of `column_var` also appears in `class_var`."""
    return set(column_var.values).issubset(class_var.values)
46+
47+
48+
class ColumnLearner(Learner):
    """Learner that builds a :class:`ColumnModel` predicting the target
    directly from a single data column, optionally fitting an intercept
    and coefficient (logistic/linear) for numeric columns."""

    def __init__(self,
                 class_var: Variable,
                 column: Variable,
                 fit_regression: bool = False):
        super().__init__()
        _check_column_combinations(class_var, column, fit_regression)
        self.class_var = class_var
        self.column = column
        self.fit_regression = fit_regression
        self.name = f"column '{column.name}'"

    def __get_fitted_parameters(self, data: Table):
        # Use learners from Orange rather than directly calling
        # scikit-learn, so that we make sure we use the same parameters
        # and get the same result as we would if we used the widgets.
        projected = data.transform(Domain([self.column], self.class_var))
        if self.class_var.is_discrete:
            model = LogisticRegressionLearner()(projected)
            return model.intercept[0], model.coefficients[0][0]
        model = LinearRegressionLearner()(projected)
        return model.intercept, model.coefficients[0]

    def fit_storage(self, data: Table):
        """Return a ColumnModel for `data`; fits coefficients if requested."""
        if data.domain.class_var != self.class_var:
            raise ValueError("Class variable does not match the data")
        if not self.fit_regression:
            return ColumnModel(self.class_var, self.column)
        intercept, coefficient = self.__get_fitted_parameters(data)
        return ColumnModel(self.class_var, self.column, intercept, coefficient)
80+
81+
82+
class ColumnModel(Model):
    """Model that predicts the target from a single column.

    For discrete targets the column is either a discrete variable whose
    values are mapped onto class values, or a numeric column interpreted
    as the probability of the second class (optionally through a logistic
    transform with `intercept`/`coefficient`). For continuous targets the
    column value is used directly (optionally scaled linearly).
    """

    def __init__(self,
                 class_var: Variable,
                 column: Variable,
                 intercept: Optional[float] = None,
                 coefficient: Optional[float] = None):
        super().__init__(Domain([column], class_var))

        # Passing `intercept is not None` validates that coefficients are
        # only supplied for continuous columns.
        _check_column_combinations(class_var, column, intercept is not None)
        if (intercept is not None) is not (coefficient is not None):
            raise ValueError(
                "Intercept and coefficient must both be provided or absent")
        self.class_var = class_var
        self.column = column
        self.intercept = intercept
        self.coefficient = coefficient
        # If the column's values are not a prefix of the class values (in the
        # same order), precompute an index remapping from column value index
        # to class value index; otherwise indices can be used as-is.
        if (column.is_discrete and
                class_var.values[:len(column.values)] != column.values):
            self.value_mapping = np.array([class_var.to_val(x)
                                           for x in column.values])
        else:
            self.value_mapping = None

        pars = f" ({intercept}, {coefficient})" if intercept is not None else ""
        self.name = f"column '{column.name}'{pars}"

    def predict_storage(self, data: Table):
        """Predict from the stored column; dispatch on target type."""
        vals = data.get_column(self.column)
        if self.class_var.is_discrete:
            return self._predict_discrete(vals)
        else:
            return self._predict_continuous(vals)

    def _predict_discrete(self, vals):
        """Return (values, probabilities) for a discrete target.

        Rows with non-finite column values get NaN predictions/probabilities.
        """
        assert isinstance(self.class_var, DiscreteVariable)
        nclasses = len(self.class_var.values)
        proba = np.full((len(vals), nclasses), np.nan)
        rows = np.isfinite(vals)  # mask of rows with a defined column value
        if self.column.is_discrete:
            mapped = vals[rows].astype(int)
            if self.value_mapping is not None:
                # Remap column value indices to class value indices.
                mapped = self.value_mapping[mapped]
                vals = vals.copy()
                vals[rows] = mapped
            # One-hot probabilities for the mapped class.
            proba[rows] = 0
            proba[rows, mapped] = 1
        else:
            if self.coefficient is None:
                # Column is used directly as P(class=1); must be a valid
                # probability.
                if not valid_prob_range(vals):
                    raise ValueError("Column values must be in [0, 1] range "
                                     "unless logistic function is applied")
                proba[rows, 1] = vals[rows]
            else:
                # Logistic transform: sigmoid(intercept + coefficient * x).
                proba[rows, 1] = (
                    1 /
                    (1 + np.exp(-self.intercept - self.coefficient * vals[rows])
                     ))

            proba[rows, 0] = 1 - proba[rows, 1]
            # Predicted class by 0.5 threshold; undefined rows stay NaN.
            vals = (proba[:, 1] > 0.5).astype(float)
            vals[~rows] = np.nan
        return vals, proba

    def _predict_continuous(self, vals):
        """Return predictions for a continuous target (optional linear map)."""
        if self.coefficient is None:
            return vals
        else:
            return vals * self.coefficient + self.intercept

    def __str__(self):
        pars = f" ({self.intercept}, {self.coefficient})" \
            if self.intercept is not None else ""
        return f'ColumnModel {self.column.name}{pars}'

0 commit comments

Comments
 (0)