Skip to content

Commit ef3e166

Browse files
authored
Merge pull request #6817 from ZanMervic/scoring-sheet
[ENH] ScoringSheet and ScoringSheetViewer widgets added
2 parents c27a803 + 12bd5a1 commit ef3e166

29 files changed

+2633
-0
lines changed

.coveragerc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ omit =
1010
*/tests/*
1111
*/setup.py
1212
*/*/setup.py
13+
Orange/classification/utils/fasterrisk/*
1314

1415
[report]
1516
exclude_lines =

Orange/classification/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from .sgd import *
2121
from .neural_network import *
2222
from .calibration import *
23+
from .scoringsheet import *
2324
try:
2425
from .catgb import *
2526
except ModuleNotFoundError:
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import numpy as np
2+
from Orange.classification.utils.fasterrisk.fasterrisk import (
3+
RiskScoreOptimizer,
4+
RiskScoreClassifier,
5+
)
6+
7+
from Orange.classification import Learner, Model
8+
from Orange.data import Table, Storage
9+
from Orange.data.filter import HasClass
10+
from Orange.preprocess import Discretize, Impute, Continuize, SelectBestFeatures
11+
from Orange.preprocess.discretize import Binning
12+
from Orange.preprocess.score import ReliefF
13+
14+
15+
def _change_class_var_values(y):
16+
"""
17+
Changes the class variable values from 0 and 1 to -1 and 1 or vice versa.
18+
"""
19+
return np.where(y == 0, -1, np.where(y == -1, 0, y))
20+
21+
22+
class ScoringSheetModel(Model):
    """
    Orange model that delegates prediction to a wrapped classifier.

    The wrapped object must provide ``predict(X)`` and ``predict_prob(X)``.
    """

    def __init__(self, model):
        # Store the wrapped classifier before running the base initializer.
        self.model = model
        super().__init__()

    def predict_storage(self, table):
        """
        Predict class values and class probabilities for ``table``.

        Returns a pair ``(y_pred, scores)``. ``y_pred`` is the wrapped
        model's prediction passed through ``_change_class_var_values``.
        ``scores`` has two columns: ``1 - p`` and ``p``, where ``p`` is the
        output of the wrapped model's ``predict_prob``.

        Raises ``TypeError`` when ``table`` is not an ``Orange.data.Storage``.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")

        features = table.X
        y_pred = _change_class_var_values(self.model.predict(features))
        y_prob = self.model.predict_prob(features)

        # Column 0: complement of the probability, column 1: the probability.
        complement_column = (1 - y_prob).reshape(-1, 1)
        probability_column = y_prob.reshape(-1, 1)
        scores = np.hstack((complement_column, probability_column))
        return y_pred, scores
36+
37+
38+
class ScoringSheetLearner(Learner):
    """
    Learner that fits a scoring sheet with FasterRisk's ``RiskScoreOptimizer``.

    Parameters
    ----------
    num_attr_after_selection
        ``k`` given to the ``SelectBestFeatures(method=ReliefF())`` step that
        is appended to the default preprocessors. Ignored when custom
        ``preprocessors`` are supplied.
    num_decision_params
        Passed to the optimizer as ``k``.
    max_points_per_param
        Coefficient bound; the optimizer receives ``lb=-max_points_per_param``
        and ``ub=max_points_per_param``.
    num_input_features
        Passed to the optimizer as ``group_sparsity``. When not ``None``, a
        feature-to-group mapping is built from the training table on every fit.
    preprocessors
        Custom preprocessors forwarded to ``Learner.__init__``; ``None``
        selects the defaults described above.
    """

    __returns__ = ScoringSheetModel
    preprocessors = [HasClass(), Discretize(method=Binning()), Impute(), Continuize()]

    def __init__(
        self,
        num_attr_after_selection=20,
        num_decision_params=5,
        max_points_per_param=5,
        num_input_features=None,
        preprocessors=None,
    ):
        self.num_decision_params = num_decision_params
        self.max_points_per_param = max_points_per_param
        self.num_input_features = num_input_features
        # Filled in by _generate_feature_group_index during fitting.
        self.feature_to_group = None

        if preprocessors is None:
            # Extend the class-level defaults with feature selection; the new
            # list is stored on the instance so the class attribute is untouched.
            self.preprocessors = [
                *self.preprocessors,
                SelectBestFeatures(method=ReliefF(), k=num_attr_after_selection),
            ]

        super().__init__(preprocessors=preprocessors)

    def incompatibility_reason(self, domain):
        """
        Return a message explaining why ``domain`` cannot be used, or ``None``.

        The domain must have a single, discrete target with at most two values.
        """
        reason = None
        if len(domain.class_vars) > 1 and not self.supports_multiclass:
            reason = "Too many target variables."
        elif not domain.has_discrete_class:
            reason = "Categorical class variable expected."
        elif len(domain.class_vars[0].values) > 2:
            reason = "Too many target variable values."
        return reason

    def fit_storage(self, table):
        """
        Fit a risk-score model on ``table`` and return a ``ScoringSheetModel``.

        Raises ``TypeError`` when ``table`` is not an ``Orange.data.Storage``
        and ``ValueError`` when the class column contains missing values or
        when the optimizer cannot be fitted for any ``k >= 1``.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        elif table.get_nan_count_class() > 0:
            raise ValueError("Class variable contains missing values.")

        if self.num_input_features is not None:
            self._generate_feature_group_index(table)

        # Instance weights are not passed to the optimizer, so they are not read.
        X, y = table.X, table.Y
        learner = RiskScoreOptimizer(
            X=X,
            y=_change_class_var_values(y),
            k=self.num_decision_params,
            select_top_m=1,
            lb=-self.max_points_per_param,
            ub=self.max_points_per_param,
            group_sparsity=self.num_input_features,
            featureIndex_to_groupIndex=self.feature_to_group,
        )

        self._optimize_decision_params_adjustment(learner)

        multipliers, intercepts, coefficients = learner.get_models()

        # Only the first returned model is used (select_top_m=1 above).
        model = RiskScoreClassifier(
            multiplier=multipliers[0],
            intercept=intercepts[0],
            coefficients=coefficients[0],
            featureNames=[attribute.name for attribute in table.domain.attributes],
            # NOTE(review): the reason for the ">10" threshold is not visible
            # in this file; confirm against RiskScoreClassifier.
            X_train=X if self.num_decision_params > 10 else None,
        )

        return ScoringSheetModel(model)

    def _optimize_decision_params_adjustment(self, learner):
        """
        Fit ``learner``, lowering its number of decision parameters ('k')
        whenever optimization fails.

        Sometimes the number of decision parameters is too high for the
        number of input features, which results in a ValueError. Each failure
        decrements ``learner.k`` in place and retries. Returns ``True`` on
        success; raises ``ValueError`` once 'k' would drop below 1.
        """
        while True:
            try:
                learner.optimize()
                return True
            except ValueError as e:
                learner.k -= 1
                if learner.k < 1:
                    # Raise a custom error when k falls below 1
                    raise ValueError(
                        "The number of input features is too low for the current settings."
                    ) from e

    def _generate_feature_group_index(self, table):
        """
        Build the feature index to group index mapping and store it in
        ``self.feature_to_group``.

        The group index groups binarized features that belong to the same
        original feature. Groups are numbered in order of first appearance,
        so the mapping is identical between runs.
        """
        original_feature_names = [
            attribute.compute_value.variable.name
            for attribute in table.domain.attributes
        ]
        # dict.fromkeys de-duplicates while keeping insertion order; iterating
        # a set of strings would make group ids depend on hash randomization.
        feature_to_group_index = {
            feature: idx
            for idx, feature in enumerate(dict.fromkeys(original_feature_names))
        }
        feature_to_group = [
            feature_to_group_index[feature] for feature in original_feature_names
        ]
        self.feature_to_group = np.asarray(feature_to_group)
146+
147+
148+
if __name__ == "__main__":
    # Manual smoke test: fit on the heart disease data set and predict on it.
    mock_learner = ScoringSheetLearner(
        num_attr_after_selection=20,
        num_decision_params=5,
        max_points_per_param=10,
        num_input_features=None,
    )
    mock_table = Table("https://datasets.biolab.si/core/heart_disease.tab")
    mock_model = mock_learner(mock_table)
    mock_model(mock_table)

Orange/classification/utils/__init__.py

Whitespace-only changes.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
3+
BSD 3-Clause License
4+
5+
Copyright (c) 2022, Jiachang Liu
6+
All rights reserved.
7+
8+
Redistribution and use in source and binary forms, with or without
9+
modification, are permitted provided that the following conditions are met:
10+
11+
* Redistributions of source code must retain the above copyright notice, this
12+
list of conditions and the following disclaimer.
13+
14+
* Redistributions in binary form must reproduce the above copyright notice,
15+
this list of conditions and the following disclaimer in the documentation
16+
and/or other materials provided with the distribution.
17+
18+
* Neither the name of the copyright holder nor the names of its
19+
contributors may be used to endorse or promote products derived from
20+
this software without specific prior written permission.
21+
22+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
26+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32+
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Notice for Use of FasterRisk Code in Orange3
2+
3+
This directory ('Orange/classification/utils/fasterrisk') contains code from the "FasterRisk" project by Jiachang Liu. This code is used under the BSD 3-Clause License. The source of this code can be found at https://github.com/jiachangliu/FasterRisk.
4+
5+
The inclusion of the FasterRisk code in this project serves as a temporary solution to address compatibility and functionality issues arising from the strict requirements of the original package. This measure will remain in place until such time as the original maintainer updates the package to address these issues.
6+
7+
A copy of the BSD 3-Clause License under which the FasterRisk code is licensed is included in this directory.

Orange/classification/utils/fasterrisk/__init__.py

Whitespace-only changes.
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import numpy as np
2+
import sys
3+
# import warnings
4+
# warnings.filterwarnings("ignore")
5+
from Orange.classification.utils.fasterrisk.utils import normalize_X, compute_logisticLoss_from_ExpyXB
6+
7+
class logRegModel:
    """
    L2-regularized logistic regression state with box-constrained
    coordinate-descent updates, working on a normalized copy of ``X``.

    The central cached quantity is ``ExpyXB = exp(y * beta0 + yX @ betas)``,
    where ``yX`` is the normalized design matrix with each row multiplied by
    its label. Most methods update this cache in place instead of
    recomputing it.

    NOTE(review): labels are presumably encoded as -1/+1 -- confirm with the
    callers; nothing in this class enforces it.
    """

    def __init__(self, X, y, lambda2=1e-8, intercept=True, original_lb=-5, original_ub=5):
        """
        Normalize ``X`` and set up the zero-coefficient starting point.

        ``lambda2`` is the L2 penalty weight, ``intercept`` enables intercept
        updates in ``finetune_on_current_support``, and ``original_lb`` /
        ``original_ub`` bound the coefficients in the original feature space.
        """
        self.X = X
        # normalize_X is defined in the sibling utils module; it returns the
        # normalized matrix, per-feature mean and norm, and the indices of
        # the features that were actually scaled.
        self.X_normalized, self.X_mean, self.X_norm, self.scaled_feature_indices = normalize_X(self.X)
        self.n, self.p = self.X_normalized.shape
        self.y = y.reshape(-1).astype(float)
        self.yX = y.reshape(-1, 1) * self.X_normalized
        # Separately allocated (p, n) copy of yX transposed, so that row j
        # holds feature j (presumably for faster per-coordinate access).
        self.yXT = np.zeros((self.p, self.n))
        self.yXT[:] = np.transpose(self.yX)[:]
        self.beta0 = 0
        self.betas = np.zeros((self.p, ))
        self.ExpyXB = np.exp(self.y * self.beta0 + self.yX.dot(self.betas))

        self.intercept = intercept
        self.lambda2 = lambda2
        self.twoLambda2 = 2 * self.lambda2

        # Step-size denominator used for every coordinate update.
        self.Lipschitz = 0.25 + self.twoLambda2
        # Bounds are given in the original space; for scaled features they
        # are multiplied by the feature norm to express them in the
        # normalized space.
        self.lbs = original_lb * np.ones(self.p)
        self.lbs[self.scaled_feature_indices] *= self.X_norm[self.scaled_feature_indices]
        self.ubs = original_ub * np.ones(self.p)
        self.ubs[self.scaled_feature_indices] *= self.X_norm[self.scaled_feature_indices]

        self.total_child_added = 0

    def warm_start_from_original_beta0_betas(self, original_beta0, original_betas):
        """Warm start from coefficients given in the original feature space."""
        # betas_initial has dimension (p+1, 1)
        self.original_beta0 = original_beta0
        self.original_betas = original_betas
        self.beta0, self.betas = self.transform_coefficients_to_normalized_space(self.original_beta0, self.original_betas)
        # NOTE(review): unconditional print to stdout on every call.
        print("warmstart solution in normalized space is {} and {}".format(self.beta0, self.betas))
        self.ExpyXB = np.exp(self.y * self.beta0 + self.yX.dot(self.betas))

    def warm_start_from_beta0_betas(self, beta0, betas):
        """Warm start from normalized-space coefficients and rebuild ExpyXB."""
        self.beta0, self.betas = beta0, betas
        self.ExpyXB = np.exp(self.y * self.beta0 + self.yX.dot(self.betas))

    def warm_start_from_beta0_betas_ExpyXB(self, beta0, betas, ExpyXB):
        """Warm start from coefficients and an already computed ExpyXB."""
        self.beta0, self.betas, self.ExpyXB = beta0, betas, ExpyXB

    def get_beta0_betas(self):
        """Return the current ``(beta0, betas)`` in the normalized space."""
        return self.beta0, self.betas

    def get_beta0_betas_ExpyXB(self):
        """Return the current ``(beta0, betas, ExpyXB)``."""
        return self.beta0, self.betas, self.ExpyXB

    def get_original_beta0_betas(self):
        """Return the current ``(beta0, betas)`` mapped to the original space."""
        return self.transform_coefficients_to_original_space(self.beta0, self.betas)

    def transform_coefficients_to_original_space(self, beta0, betas):
        """
        Map normalized-space coefficients to the original feature space.

        Scaled coefficients are divided by the feature norm and the intercept
        is reduced by ``X_mean . original_betas``. ``betas`` is not modified.
        """
        original_betas = betas.copy()
        original_betas[self.scaled_feature_indices] = original_betas[self.scaled_feature_indices]/self.X_norm[self.scaled_feature_indices]
        original_beta0 = beta0 - np.dot(self.X_mean, original_betas)
        return original_beta0, original_betas

    def transform_coefficients_to_normalized_space(self, original_beta0, original_betas):
        """
        Map original-space coefficients to the normalized feature space.

        Inverse of ``transform_coefficients_to_original_space``: scaled
        coefficients are multiplied by the feature norm and the intercept is
        increased by ``X_mean . original_betas``. The input is not modified.
        """
        betas = original_betas.copy()
        betas[self.scaled_feature_indices] = betas[self.scaled_feature_indices] * self.X_norm[self.scaled_feature_indices]
        beta0 = original_beta0 + self.X_mean.dot(original_betas)
        return beta0, betas

    def get_grad_at_coord(self, ExpyXB, betas_j, yX_j, j):
        """
        Return the partial derivative of the regularized objective for one
        coefficient: ``-<1 / (1 + ExpyXB), yX_j> + 2 * lambda2 * betas_j``.

        ``j`` is accepted but not used; the caller passes the column ``yX_j``.
        """
        # Equivalent formulations kept from the upstream source:
        # return -np.dot(1/(1+ExpyXB), self.yX[:, j]) + self.twoLambda2 * betas_j
        # return -np.inner(1/(1+ExpyXB), self.yX[:, j]) + self.twoLambda2 * betas_j
        # return -np.inner(np.reciprocal(1+ExpyXB), self.yX[:, j]) + self.twoLambda2 * betas_j
        return -np.inner(np.reciprocal(1+ExpyXB), yX_j) + self.twoLambda2 * betas_j
        # return -yX_j.dot(np.reciprocal(1+ExpyXB)) + self.twoLambda2 * betas_j

    def update_ExpyXB(self, ExpyXB, yX_j, diff_betas_j):
        """Update ``ExpyXB`` in place for a change ``diff_betas_j`` in one coefficient."""
        ExpyXB *= np.exp(yX_j * diff_betas_j)

    def optimize_1step_at_coord(self, ExpyXB, betas, yX_j, j):
        """
        Take one projected gradient step on coefficient ``j``.

        The step size is ``1 / self.Lipschitz`` and the result is clamped to
        ``[self.lbs[j], self.ubs[j]]``. Both ``betas`` and ``ExpyXB`` are
        modified in place; nothing is returned.
        """
        # in-place modification, check that ExpyXB and betas are passed by reference
        prev_betas_j = betas[j]
        current_betas_j = prev_betas_j
        grad_at_j = self.get_grad_at_coord(ExpyXB, current_betas_j, yX_j, j)
        step_at_j = grad_at_j / self.Lipschitz
        current_betas_j = prev_betas_j - step_at_j
        # current_betas_j = np.clip(current_betas_j, self.lbs[j], self.ubs[j])
        current_betas_j = max(self.lbs[j], min(self.ubs[j], current_betas_j))
        diff_betas_j = current_betas_j - prev_betas_j
        betas[j] = current_betas_j

        # ExpyXB *= np.exp(yX_j * diff_betas_j)
        self.update_ExpyXB(ExpyXB, yX_j, diff_betas_j)

    def finetune_on_current_support(self, ExpyXB, beta0, betas, total_CD_steps=100):
        """
        Run coordinate descent restricted to the current nonzero coefficients.

        The support is every index with ``|betas| > 1e-9``, visited in order
        of decreasing absolute gradient (computed once, before the loop). Up
        to ``total_CD_steps`` sweeps are made; every tenth sweep the relative
        change of the penalized loss is checked and the loop stops when it is
        below ``1e-8``.

        ``ExpyXB`` and ``betas`` are modified in place; the updated
        ``(ExpyXB, beta0, betas)`` triple is also returned.
        """

        support = np.where(np.abs(betas) > 1e-9)[0]
        grad_on_support = -self.yXT[support].dot(np.reciprocal(1+ExpyXB)) + self.twoLambda2 * betas[support]
        abs_grad_on_support = np.abs(grad_on_support)
        support = support[np.argsort(-abs_grad_on_support)]

        # compute_logisticLoss_from_ExpyXB comes from the sibling utils module.
        loss_before = compute_logisticLoss_from_ExpyXB(ExpyXB) + self.lambda2 * betas[support].dot(betas[support])
        for steps in range(total_CD_steps): # number of iterations for coordinate descent

            if self.intercept:
                # The intercept is unpenalized and unbounded.
                grad_intercept = -np.reciprocal(1+ExpyXB).dot(self.y)
                step_at_intercept = grad_intercept / (self.n * 0.25) # lipschitz constant is 0.25 at the intercept
                beta0 = beta0 - step_at_intercept
                ExpyXB *= np.exp(self.y * (-step_at_intercept))

            for j in support:
                self.optimize_1step_at_coord(ExpyXB, betas, self.yXT[j, :], j) # in-place modification on ExpyXB and betas

            if steps % 10 == 0:
                loss_after = compute_logisticLoss_from_ExpyXB(ExpyXB) + self.lambda2 * betas[support].dot(betas[support])
                # NOTE(review): divides by loss_after; a loss of exactly zero
                # would produce a division warning/inf here.
                if abs(loss_before - loss_after)/loss_after < 1e-8:
                    # print("break after {} steps; support size is {}".format(steps, len(support)))
                    break
                loss_before = loss_after

        return ExpyXB, beta0, betas

    def compute_yXB(self, beta0, betas):
        """Return ``y * (beta0 + X_normalized @ betas)`` for every sample."""
        return self.y*(beta0 + np.dot(self.X_normalized, betas))
123+

0 commit comments

Comments
 (0)