Skip to content

Commit 583695f

Browse files
Rudraksh Tuwani
authored and committed
dre module draft
1 parent c49e8a6 commit 583695f

File tree

1 file changed

+267
-0
lines changed

1 file changed

+267
-0
lines changed

mapie/dre.py

Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
from __future__ import annotations
2+
3+
from typing import Optional
4+
import numpy as np
5+
from sklearn.base import ClassifierMixin
6+
from sklearn.linear_model import LogisticRegression
7+
from sklearn.pipeline import Pipeline
8+
from sklearn.utils.validation import check_is_fitted
9+
from mapie._typing import ArrayLike
10+
11+
12+
class DensityRatioEstimator:
    """
    Template class for density ratio estimation.

    Every method defined here is a placeholder: it takes no arguments
    besides ``self``, performs no work and returns ``None``. Concrete
    estimators derive from this class and override the methods.
    """

    def __init__(self):
        """Create the template estimator; no state is stored."""
        return None

    def fit(self):
        """Placeholder for the fitting step; does nothing."""
        return None

    def predict(self):
        """Placeholder for the prediction step; does nothing."""
        return None

    def check_is_fitted(self):
        """Placeholder for the fitted-state check; does nothing."""
        return None
26+
27+
28+
class ProbClassificationDRE(DensityRatioEstimator):
    """
    Density ratio estimation by classification.

    This class implements the density ratio estimation by classification
    strategy. The broad idea is to first learn a discriminative classifier to
    distinguish between source and target datasets, and then use the class
    probability estimates from the classifier to estimate the density ratio.

    Parameters
    ----------
    estimator: Optional[ClassifierMixin]
        Any classifier with scikit-learn API
        (i.e. with fit, predict, and predict_proba methods), by default
        ``None``.
        If ``None``, estimator defaults to a ``LogisticRegression`` instance.

    clip_min: Optional[float]
        Lower bound the probability estimate from the classifier to
        ``clip_min``. If ``None``, the estimates are not lower bounded
        (a bound of 0 is used).

        By default ``None``.

    clip_max: Optional[float]
        Upper bound the probability estimate from the classifier to
        ``clip_max``. If ``None``, the estimates are not upper bounded
        (a bound of 1 is used).

        By default ``None``.

    Attributes
    ----------
    estimator: ClassifierMixin
        The classifier used to discriminate source from target samples.

    clip_min: float
        Effective lower bound applied to the probability estimates.

    clip_max: float
        Effective upper bound applied to the probability estimates.

    source_prob: float
        The marginal probability of getting a datapoint from the source
        distribution. Set by ``fit``.

    target_prob: float
        The marginal probability of getting a datapoint from the target
        distribution. Set by ``fit``.
    """

    def __init__(
        self,
        estimator: Optional[ClassifierMixin] = None,
        clip_min: Optional[float] = None,
        clip_max: Optional[float] = None,
    ) -> None:
        self.estimator = self._check_estimator(estimator)

        # Validate the *arguments*; the attributes do not exist yet.
        if clip_max is None:
            self.clip_max = 1.0
        elif 0 <= clip_max <= 1:
            self.clip_max = clip_max
        else:
            raise ValueError("Expected `clip_max` to be between 0 and 1.")

        # Compare against the resolved upper bound so that a ``None``
        # ``clip_max`` (i.e. 1) is handled as well.
        if clip_min is None:
            self.clip_min = 0.0
        elif 0 <= clip_min <= self.clip_max:
            self.clip_min = clip_min
        else:
            raise ValueError(
                "Expected `clip_min` to be between 0 and `clip_max`.")

    def _check_estimator(
        self,
        estimator: Optional[ClassifierMixin] = None,
    ) -> ClassifierMixin:
        """
        Check if estimator is ``None``,
        and returns a ``LogisticRegression`` instance if necessary.

        Parameters
        ----------
        estimator : Optional[ClassifierMixin], optional
            Estimator to check, by default ``None``

        Returns
        -------
        ClassifierMixin
            The estimator itself or a default ``LogisticRegression`` instance.

        Raises
        ------
        ValueError
            If the estimator is not ``None``
            and lacks any of the fit, predict, or predict_proba methods.
        """
        if estimator is None:
            return LogisticRegression(class_weight="balanced", random_state=0)

        # For a pipeline, the final step is the one that must be a classifier.
        est = estimator[-1] if isinstance(estimator, Pipeline) else estimator

        required = ("fit", "predict", "predict_proba")
        # Reject the estimator as soon as ANY required method is missing.
        if not all(hasattr(est, method) for method in required):
            raise ValueError(
                "Invalid estimator. "
                "Please provide a classifier with fit, "
                "predict, and predict_proba methods."
            )

        return estimator

    def fit(
        self,
        X_source: ArrayLike,
        X_target: ArrayLike,
        source_prob: Optional[float] = None,
        target_prob: Optional[float] = None,
        sample_weight: Optional[ArrayLike] = None
    ) -> ProbClassificationDRE:
        """
        Fit the discriminative classifier to source and target samples.

        Parameters
        ----------
        X_source: ArrayLike of shape (n_source_samples, n_features)
            Training data drawn from the source distribution.

        X_target: ArrayLike of shape (n_target_samples, n_features)
            Training data drawn from the target distribution.

        source_prob: Optional[float]
            The marginal probability of getting a datapoint from the source
            distribution. If ``None``, the proportion of source examples in
            the training dataset is used.

            By default ``None``.

        target_prob: Optional[float]
            The marginal probability of getting a datapoint from the target
            distribution. If ``None``, the proportion of target examples in
            the training dataset is used.

            By default ``None``.

        sample_weight : Optional[ArrayLike] of shape
            (n_source_samples + n_target_samples,)
            Sample weights forwarded to the classifier's ``fit`` method
            (to the final step when the estimator is a ``Pipeline``).
            Source samples come first, followed by target samples.
            If ``None``, no weights are passed to the classifier.

            By default ``None``.

        Returns
        -------
        ProbClassificationDRE
            The density ratio estimator itself.

        Raises
        ------
        ValueError
            If ``source_prob`` and ``target_prob`` do not add up to 1.
        """
        source_arr = np.asarray(X_source)
        target_arr = np.asarray(X_target)

        # Find the marginal source and target probability.
        n_source = source_arr.shape[0]
        n_target = target_arr.shape[0]
        n_total = n_source + n_target

        if source_prob is None:
            source_prob = n_source / n_total

        if target_prob is None:
            target_prob = n_target / n_total

        # Tolerant comparison: exact equality fails on e.g. 0.1 + 0.9
        # style rounding errors.
        if not np.isclose(source_prob + target_prob, 1.0):
            raise ValueError(
                "``source_prob`` and ``target_prob`` do not add up to 1.")

        # ``predict`` needs the marginals to rescale the odds.
        self.source_prob = source_prob
        self.target_prob = target_prob

        # Estimate the conditional probability of source/target given X.
        # Label 0 marks source samples, label 1 marks target samples.
        X = np.concatenate((source_arr, target_arr), axis=0)
        y = np.concatenate((np.zeros(n_source), np.ones(n_target)), axis=0)

        # Only forward ``sample_weight`` when provided, so that classifiers
        # whose ``fit`` does not accept it keep working.
        fit_params = {}
        if sample_weight is not None:
            if isinstance(self.estimator, Pipeline):
                step_name = self.estimator.steps[-1][0]
                fit_params[f"{step_name}__sample_weight"] = sample_weight
            else:
                fit_params["sample_weight"] = sample_weight

        self.estimator.fit(X, y, **fit_params)

        return self

    def predict(
        self,
        X: ArrayLike,
    ) -> ArrayLike:
        """
        Predict the density ratio estimates for new samples.

        Parameters
        ----------
        X: ArrayLike of shape (n_samples, n_features)
            Samples to get the density ratio estimates for.

        Returns
        -------
        ArrayLike of shape (n_samples,)
            The density ratio estimates: the clipped probability of the
            target class divided by the clipped probability of the source
            class, multiplied by ``source_prob / target_prob``.
        """
        # Some models in sklearn have predict_proba but not predict_log_proba.
        if hasattr(self.estimator, "predict_log_proba"):
            log_probs = self.estimator.predict_log_proba(X)
        else:
            # log(0) = -inf is a legitimate value here; silence the warning.
            with np.errstate(divide="ignore"):
                log_probs = np.log(self.estimator.predict_proba(X))

        # Clip prob to mitigate extremely high or low dre.
        # The default ``clip_min`` of 0 maps to -inf (no lower bound).
        with np.errstate(divide="ignore"):
            log_lower = np.log(self.clip_min)
            log_upper = np.log(self.clip_max)
        log_probs = np.clip(log_probs, a_min=log_lower, a_max=log_upper)

        # Column 1 is the target class (label 1), column 0 the source class.
        log_ratio = (
            log_probs[:, 1] - log_probs[:, 0]
            + np.log(self.source_prob) - np.log(self.target_prob)
        )
        return np.exp(log_ratio)

    def check_is_fitted(self):
        """
        Check that the underlying classifier has been fitted.

        For a ``Pipeline`` the check is applied to its final step.
        Delegates to ``sklearn.utils.validation.check_is_fitted``.
        """
        if isinstance(self.estimator, Pipeline):
            check_is_fitted(self.estimator[-1])
        else:
            check_is_fitted(self.estimator)
253+
254+
255+
def calculate_ess(weights: ArrayLike) -> float:
    """
    Calculates the effective sample size given importance weights for the
    source distribution.

    The value returned is ``sum(weights) ** 2 / sum(weights ** 2)``.

    Parameters
    ----------
    weights: ArrayLike
        Importance weights for the examples in source distribution.
        Any array-like (list, tuple, NumPy array, ...) is accepted.

    Returns
    -------
    float
        The effective sample size.
    """
    # Convert first so that plain lists/tuples support ``.sum()`` and ``**``.
    w = np.asarray(weights, dtype=float)
    num = w.sum() ** 2
    denom = (w ** 2).sum()
    return num / denom

0 commit comments

Comments
 (0)