55computed with guarantees.
66
77.. warning::
8- This method is extremely inefficient. Potential improvements to the
8+ This method is very inefficient. Potential improvements to the
99 implementation notwithstanding, convergence seems to be very slow (in terms
1010 of evaluations of the utility required). We recommend other Monte Carlo
1111 methods instead.
@@ -43,6 +43,20 @@ def _constants(
4343 """A helper function returning the constants for the algorithm. Pretty ugly,
4444 yes.
4545
46+ :param n: The number of data points.
47+ :param epsilon: The error tolerance.
48+ :param delta: The confidence level.
49+ :param utility_range: The range of the utility function.
50+
51+ :return: A namedtuple with the constants. The fields are the same as in the
52+ paper:
53+ - kk: the sample sizes (i.e. an array of 1, 2, ..., n - 1)
54+ - Z: the normalization constant
55+ - q: the probability of drawing a sample of size k
56+ - q_tot: another normalization constant
57+ - T: the number of iterations. This will be -1 if the utility_range is
58+ infinite, e.g. because the :class:`~pydvl.utils.score.Scorer` does
59+ not define a range.
4660 """
4761 r = utility_range
4862
@@ -86,7 +100,7 @@ def num_samples_eps_delta(
86100
87101 :param eps: ε
88102 :param delta: δ
89- :param n: Number of samples
103+ :param n: Number of data points
90104 :param utility_range: Range of the :class:`~pydvl.utils.utility.Utility`
91105 function
92106 :return: Number of samples from $2^{[n]}$ guaranteeing ε/√n-correct Shapley
@@ -110,8 +124,7 @@ def _group_testing_shapley(
110124 :param u: Utility object with model, data, and scoring function.
111125 :param n_samples: total number of samples (subsets) to use.
112126 :param progress: Whether to display progress bars for each job.
113- :param job_id: id to use for reporting progress (e.g. to place
114- progres bars)
127+ :param job_id: id to use for reporting progress (e.g. to place progress bars)
115128 :return:
116129 """
117130 rng = np .random .default_rng ()
@@ -144,8 +157,8 @@ def group_testing_shapley(
144157 in :footcite:t:`jia_efficient_2019`.
145158
146159 .. warning::
147- This method is extremely inefficient. It requires several orders of
148- magnitude more evaluations of the utility than others in
160+ This method is very inefficient. It requires several orders of magnitude
161+ more evaluations of the utility than others in
149162 :mod:`~pydvl.value.shapley.montecarlo`. It also uses several intermediate
150163 objects like the results from the runners and the constraint matrices
151164 which can become rather large.
@@ -179,6 +192,7 @@ def group_testing_shapley(
179192 """
180193
181194 n = len (u .data .indices )
195+
182196 const = _constants (
183197 n = n ,
184198 epsilon = epsilon ,
0 commit comments