Commit 4f63b57

Add sample weight #minor (#56)
* Add ray to tests
* Add sample_weight parameter
1 parent cad0bdf commit 4f63b57

13 files changed: +110 -33 lines changed

.github/workflows/deploy-pypi.yml

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ jobs:
           python -m pip install pytest-flake8==1.1.1
           python -m pip install pytest-pydocstyle==2.3.0
           python -m pip install pytest-cov==3.0.0
+          python -m pip install ray==1.13.0
           python -m pip install .
       - name: Test with pytest
         run: |

.github/workflows/test-pr.yml

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ jobs:
           python -m pip install pytest-flake8==1.1.1
           python -m pip install pytest-pydocstyle==2.3.0
           python -m pip install pytest-cov==3.0.0
+          python -m pip install ray==1.13.0
           python -m pip install .
       - name: Test with pytest
         run: |

hiclass/BinaryPolicy.py

Lines changed: 28 additions & 2 deletions
@@ -13,7 +13,9 @@ class BinaryPolicy(ABC):
     Every policy should implement the methods positive_examples and negative_examples.
     """

-    def __init__(self, digraph: nx.DiGraph, X: np.ndarray, y: np.ndarray):
+    def __init__(
+        self, digraph: nx.DiGraph, X: np.ndarray, y: np.ndarray, sample_weight=None
+    ):
         """
         Initialize a BinaryPolicy with the required data.

@@ -26,10 +28,14 @@ def __init__(self, digraph: nx.DiGraph, X: np.ndarray, y: np.ndarray):
         y : np.ndarray
             Labels which will be assigned to the different samples.
             Has to be 2D array.
+        sample_weight : array-like of shape (n_samples,), default=None
+            Array of weights that are assigned to individual samples.
+            If not provided, then each sample is given unit weight.
         """
         self.digraph = digraph
         self.X = X
         self.y = y
+        self.sample_weight = sample_weight

     def positive_examples(self, node) -> np.ndarray:
         """
@@ -128,15 +134,35 @@ def get_binary_examples(self, node) -> tuple:
         negative_examples = self.negative_examples(node)
         positive_x = self.X[positive_examples]
         negative_x = self.X[negative_examples]
+        positive_weights = (
+            self.sample_weight[positive_examples]
+            if self.sample_weight is not None
+            else None
+        )
+        negative_weights = (
+            self.sample_weight[negative_examples]
+            if self.sample_weight is not None
+            else None
+        )
         if isinstance(self.X, np.ndarray):
             X = np.concatenate([positive_x, negative_x])
+            sample_weights = (
+                np.concatenate([positive_weights, negative_weights])
+                if self.sample_weight is not None
+                else None
+            )
             y = np.zeros(len(X))
             y[: len(positive_x)] = 1
         elif isinstance(self.X, csr_matrix):
             X = vstack([positive_x, negative_x])
+            sample_weights = (
+                vstack([positive_weights, negative_weights])
+                if self.sample_weight is not None
+                else None
+            )
             y = np.zeros(X.shape[0])
             y[: positive_x.shape[0]] = 1
-        return X, y
+        return X, y, sample_weights


 class ExclusivePolicy(BinaryPolicy):

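For illustration only (not part of this commit; toy arrays and the *_bin names are made up), the np.ndarray branch of the new weight handling in get_binary_examples boils down to slicing the weights with the same positive/negative indices used for X, so rows of X, y and the weights stay aligned:

import numpy as np

# Toy stand-ins for self.X, self.sample_weight and the index arrays returned
# by positive_examples/negative_examples (all invented for this sketch).
X = np.array([[0.0], [1.0], [2.0], [3.0]])
sample_weight = np.array([1.0, 0.5, 2.0, 1.5])
positive_examples = np.array([0, 1])
negative_examples = np.array([2, 3])

positive_x, negative_x = X[positive_examples], X[negative_examples]
positive_weights = sample_weight[positive_examples]
negative_weights = sample_weight[negative_examples]

# Same pattern as the dense branch above: concatenate positives before
# negatives and label the first block with 1.
X_bin = np.concatenate([positive_x, negative_x])
sample_weights = np.concatenate([positive_weights, negative_weights])
y_bin = np.zeros(len(X_bin))
y_bin[: len(positive_x)] = 1
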
hiclass/ConstantClassifier.py

Lines changed: 4 additions & 1 deletion
@@ -5,7 +5,7 @@
 class ConstantClassifier:
     """A classifier that always returns the only label seen during fit."""

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
         """
         Fit a constant classifier.

@@ -17,6 +17,9 @@ def fit(self, X, y):
             converted into a sparse ``csc_matrix``.
         y : array-like of shape (n_samples, n_levels)
             The target values, i.e., hierarchical class labels for classification.
+        sample_weight : array-like of shape (n_samples,), default=None
+            Array of weights that are assigned to individual samples.
+            If not provided, then each sample is given unit weight.

         Returns
         -------

hiclass/HierarchicalClassifier.py

Lines changed: 13 additions & 2 deletions
@@ -7,6 +7,7 @@
 from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
+from sklearn.utils.validation import _check_sample_weight

 try:
     import ray
@@ -96,7 +97,7 @@ def __init__(
         self.n_jobs = n_jobs
         self.classifier_abbreviation = classifier_abbreviation

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
         """
         Fit a local hierarchical classifier.

@@ -110,6 +111,9 @@ def fit(self, X, y):
             converted into a sparse ``csc_matrix``.
         y : array-like of shape (n_samples, n_levels)
             The target values, i.e., hierarchical class labels for classification.
+        sample_weight : array-like of shape (n_samples,), default=None
+            Array of weights that are assigned to individual samples.
+            If not provided, then each sample is given unit weight.

         Returns
         -------
@@ -122,14 +126,19 @@ def fit(self, X, y):
         # Delete unnecessary variables
         self._clean_up()

-    def _pre_fit(self, X, y):
+    def _pre_fit(self, X, y, sample_weight):
         # Check that X and y have correct shape
         # and convert them to np.ndarray if need be

         self.X_, self.y_ = self._validate_data(
             X, y, multi_output=True, accept_sparse="csr"
         )

+        if sample_weight is not None:
+            self.sample_weight_ = _check_sample_weight(sample_weight, X)
+        else:
+            self.sample_weight_ = None
+
         self.y_ = make_leveled(self.y_)

         # Create and configure logger
@@ -329,3 +338,5 @@ def _clean_up(self):
         self.logger_.info("Cleaning up variables that can take a lot of disk space")
         del self.X_
         del self.y_
+        if self.sample_weight_ is not None:
+            del self.sample_weight_

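As a side note, a small standalone sketch (toy arrays, not part of the diff) of what sklearn's _check_sample_weight does when _pre_fit calls it: it validates the weights against the number of rows in X and returns them as a float array, or unit weights when given None.

import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.zeros((3, 2))                        # toy input; only its row count matters
print(_check_sample_weight([1, 2, 3], X))   # [1. 2. 3.]  (validated float array)
print(_check_sample_weight(None, X))        # [1. 1. 1.]  (unit weights)
# A length mismatch raises a ValueError, so invalid weights fail early in
# _pre_fit instead of deep inside a local classifier.
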
hiclass/LocalClassifierPerLevel.py

Lines changed: 14 additions & 6 deletions
@@ -78,7 +78,7 @@ def __init__(
             classifier_abbreviation="LCPL",
         )

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
         """
         Fit a local classifier per level.

@@ -90,14 +90,17 @@ def fit(self, X, y):
             converted into a sparse ``csc_matrix``.
         y : array-like of shape (n_samples, n_levels)
             The target values, i.e., hierarchical class labels for classification.
+        sample_weight : array-like of shape (n_samples,), default=None
+            Array of weights that are assigned to individual samples.
+            If not provided, then each sample is given unit weight.

         Returns
         -------
         self : object
             Fitted estimator.
         """
         # Execute common methods necessary before fitting
-        super()._pre_fit(X, y)
+        super()._pre_fit(X, y, sample_weight)

         # Fit local classifiers in DAG
         super().fit(X, y)
@@ -232,17 +235,22 @@ def _fit_digraph(self, local_mode: bool = False, use_joblib: bool = False):
     def _fit_classifier(self, level, separator):
         classifier = self.local_classifiers_[level]

-        X, y = self._remove_empty_leaves(separator, self.X_, self.y_[:, level])
+        X, y, sample_weight = self._remove_empty_leaves(
+            separator, self.X_, self.y_[:, level], self.sample_weight_
+        )

         unique_y = np.unique(y)
         if len(unique_y) == 1 and self.replace_classifiers:
             classifier = ConstantClassifier()
-        classifier.fit(X, y)
+        classifier.fit(X, y, sample_weight)
         return classifier

     @staticmethod
-    def _remove_empty_leaves(separator, X, y):
+    def _remove_empty_leaves(separator, X, y, sample_weight):
         # Detect rows where leaves are not empty
         leaves = np.array([str(i).split(separator)[-1] for i in y])
         mask = leaves != ""
-        return X[mask], y[mask]
+        X = X[mask]
+        y = y[mask]
+        sample_weight = sample_weight[mask] if sample_weight is not None else None
+        return X, y, sample_weight

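A minimal sketch of the masking in _remove_empty_leaves (toy labels with a placeholder "/" separator; the real separator is whatever the estimator uses): rows whose leaf is empty are dropped, and their weights are dropped with them so X, y and sample_weight stay aligned.

import numpy as np

separator = "/"  # placeholder for this sketch only
y_level = np.array(["animal/cat", "animal/", "plant/oak"])
X = np.array([[1.0], [2.0], [3.0]])
sample_weight = np.array([0.5, 1.0, 2.0])

leaves = np.array([str(i).split(separator)[-1] for i in y_level])
mask = leaves != ""                        # row 1 has an empty leaf -> dropped
X, y_level = X[mask], y_level[mask]
sample_weight = sample_weight[mask] if sample_weight is not None else None
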
hiclass/LocalClassifierPerNode.py

Lines changed: 8 additions & 5 deletions
@@ -85,7 +85,7 @@ def __init__(
         )
         self.binary_policy = binary_policy

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
         """
         Fit a local classifier per node.

@@ -97,14 +97,17 @@ def fit(self, X, y):
             converted into a sparse ``csc_matrix``.
         y : array-like of shape (n_samples, n_levels)
             The target values, i.e., hierarchical class labels for classification.
+        sample_weight : array-like of shape (n_samples,), default=None
+            Array of weights that are assigned to individual samples.
+            If not provided, then each sample is given unit weight.

         Returns
         -------
         self : object
             Fitted estimator.
         """
         # Execute common methods necessary before fitting
-        super()._pre_fit(X, y)
+        super()._pre_fit(X, y, sample_weight)

         # Initialize policy
         self._initialize_binary_policy()
@@ -192,7 +195,7 @@ def _initialize_binary_policy(self):
         try:
             self.binary_policy_ = BinaryPolicy.IMPLEMENTED_POLICIES[
                 self.binary_policy.lower()
-            ](self.hierarchy_, self.X_, self.y_)
+            ](self.hierarchy_, self.X_, self.y_, self.sample_weight_)
         except KeyError:
             self.logger_.error(
                 f"Policy {self.binary_policy} not implemented. Available policies are:\n"
@@ -226,11 +229,11 @@ def _fit_digraph(self, local_mode: bool = False, use_joblib: bool = False):
     @staticmethod
     def _fit_classifier(self, node):
         classifier = self.hierarchy_.nodes[node]["classifier"]
-        X, y = self.binary_policy_.get_binary_examples(node)
+        X, y, sample_weight = self.binary_policy_.get_binary_examples(node)
         unique_y = np.unique(y)
         if len(unique_y) == 1 and self.replace_classifiers:
             classifier = ConstantClassifier()
-        classifier.fit(X, y)
+        classifier.fit(X, y, sample_weight)
         return classifier

     def _clean_up(self):

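Putting the pieces together, a minimal usage sketch of the new keyword (toy data; RandomForestClassifier is just an example local estimator, not something this commit prescribes):

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from hiclass import LocalClassifierPerNode

X = np.array([[1.0], [2.0], [3.0], [4.0]])
y = np.array([
    ["animal", "cat"],
    ["animal", "dog"],
    ["plant", "oak"],
    ["plant", "rose"],
])
# Up-weight the plant samples; the weights flow through _pre_fit into the
# binary policy and finally into each local classifier's fit call.
sample_weight = np.array([1.0, 1.0, 2.0, 2.0])

clf = LocalClassifierPerNode(local_classifier=RandomForestClassifier())
clf.fit(X, y, sample_weight=sample_weight)
predictions = clf.predict(X)
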
hiclass/LocalClassifierPerParentNode.py

Lines changed: 11 additions & 5 deletions
@@ -71,7 +71,7 @@ def __init__(
             classifier_abbreviation="LCPPN",
         )

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
         """
         Fit a local classifier per parent node.

@@ -83,14 +83,17 @@ def fit(self, X, y):
             converted into a sparse ``csc_matrix``.
         y : array-like of shape (n_samples, n_levels)
             The target values, i.e., hierarchical class labels for classification.
+        sample_weight : array-like of shape (n_samples,), default=None
+            Array of weights that are assigned to individual samples.
+            If not provided, then each sample is given unit weight.

         Returns
         -------
         self : object
             Fitted estimator.
         """
         # Execute common methods necessary before fitting
-        super()._pre_fit(X, y)
+        super()._pre_fit(X, y, sample_weight)

         # Fit local classifiers in DAG
         super().fit(X, y)
@@ -187,17 +190,20 @@ def _get_successors(self, node):
             else:
                 y.append(row[np.where(row == node)[0][0] + 1])
         y = np.array(y)
-        return X, y
+        sample_weight = (
+            self.sample_weight_[mask] if self.sample_weight_ is not None else None
+        )
+        return X, y, sample_weight

     @staticmethod
     def _fit_classifier(self, node):
         classifier = self.hierarchy_.nodes[node]["classifier"]
         # get children examples
-        X, y = self._get_successors(node)
+        X, y, sample_weight = self._get_successors(node)
         unique_y = np.unique(y)
         if len(unique_y) == 1 and self.replace_classifiers:
             classifier = ConstantClassifier()
-        classifier.fit(X, y)
+        classifier.fit(X, y, sample_weight)
         return classifier

     def _fit_digraph(self, local_mode: bool = False, use_joblib: bool = False):

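One practical consequence of the classifier.fit(X, y, sample_weight) calls introduced above (an observation about the diff, not something it enforces explicitly): the chosen local classifier must accept a sample_weight argument in its own fit, since the weights are always passed positionally. A quick way to check an sklearn estimator, sketched below:

from inspect import signature
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

for estimator in (LogisticRegression(), KNeighborsClassifier()):
    supports = "sample_weight" in signature(estimator.fit).parameters
    print(type(estimator).__name__, "accepts sample_weight:", supports)
# LogisticRegression accepts sample_weight: True
# KNeighborsClassifier accepts sample_weight: False
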