
Commit f16e174

Merge branch 'develop' into dependabot/pip/transformers-4.48.0
2 parents: 42381ba + 5962321

File tree: 178 files changed, +11,174 −4,833 lines


.github/workflows/run-notebook-tests-workflow.yaml

Lines changed: 5 additions & 0 deletions

@@ -23,6 +23,11 @@ jobs:
   run-tests:
     runs-on: ubuntu-22.04
     steps:
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          large-packages: false
+          docker-images: false
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0

.notebook_test_durations

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
   "notebooks/influence_synthetic.ipynb::": 7.191153166999996,
   "notebooks/influence_wine.ipynb::": 11.610076332999995,
   "notebooks/least_core_basic.ipynb::": 14.069404709000011,
-  "notebooks/least_core_basic_new.ipynb::": 24.492538208000013,
   "notebooks/msr_banzhaf_digits.ipynb::": 86.62082037599998,
   "notebooks/shapley_basic_spotify.ipynb::": 15.088616748999982,
   "notebooks/shapley_knn_flowers.ipynb::": 6.810235208000023,

.test_durations

Lines changed: 6 additions & 6 deletions

@@ -1521,8 +1521,8 @@
   "tests/valuation/methods/test_semivalues.py::test_coefficients[BetaShapleyValuation-kwargs1-10]": 0.0016590010000072652,
   "tests/valuation/methods/test_semivalues.py::test_coefficients[BetaShapleyValuation-kwargs2-100]": 0.0022294990000091275,
   "tests/valuation/methods/test_semivalues.py::test_coefficients[BetaShapleyValuation-kwargs2-10]": 0.003863207999984297,
-  "tests/valuation/methods/test_semivalues.py::test_coefficients[DataBanzhafValuation-kwargs3-100]": 0.001800666000065121,
-  "tests/valuation/methods/test_semivalues.py::test_coefficients[DataBanzhafValuation-kwargs3-10]": 0.0016530420000435697,
+  "tests/valuation/methods/test_semivalues.py::test_coefficients[BanzhafValuation-kwargs3-100]": 0.001800666000065121,
+  "tests/valuation/methods/test_semivalues.py::test_coefficients[BanzhafValuation-kwargs3-10]": 0.0016530420000435697,
   "tests/valuation/methods/test_semivalues.py::test_coefficients[ShapleyValuation-kwargs4-100]": 0.0018769589999578784,
   "tests/valuation/methods/test_semivalues.py::test_coefficients[ShapleyValuation-kwargs4-10]": 0.0016063749999375432,
   "tests/valuation/methods/test_semivalues.py::test_msr_banzhaf[5]": 9.342398666999998,
@@ -1636,10 +1636,10 @@
   "tests/valuation/scorers/test_classwise.py::test_classwise_scorer[test_data2-expected_scores2]": 0.0025690839999974457,
   "tests/valuation/scorers/test_scorers.py::test_compose_score": 0.0019082069999996065,
   "tests/valuation/scorers/test_scorers.py::test_scorer": 0.001976999999998341,
-  "tests/valuation/test_interface.py::test_data_banzhaf_valuation[1]": 0.0836418330000015,
-  "tests/valuation/test_interface.py::test_data_banzhaf_valuation[2]": 1.2780167490000025,
-  "tests/valuation/test_interface.py::test_data_beta_shapley_valuation[1]": 4.139234666999997,
-  "tests/valuation/test_interface.py::test_data_beta_shapley_valuation[2]": 3.603092916999998,
+  "tests/valuation/test_interface.py::test_banzhaf_valuation[1]": 0.0836418330000015,
+  "tests/valuation/test_interface.py::test_banzhaf_valuation[2]": 1.2780167490000025,
+  "tests/valuation/test_interface.py::test_beta_shapley_valuation[1]": 4.139234666999997,
+  "tests/valuation/test_interface.py::test_beta_shapley_valuation[2]": 3.603092916999998,
   "tests/valuation/test_interface.py::test_shapley_valuation[1]": 0.27120083299999465,
   "tests/valuation/test_interface.py::test_shapley_valuation[2]": 0.15037520699999618,
   "tests/valuation/test_interface.py::test_data_utility_learning[1]": 0.026216332999993597,

CHANGELOG.md

Lines changed: 23 additions & 2 deletions

@@ -5,6 +5,13 @@

 ### Added

+- Simple memory monitor / reporting
+  [PR #663](https://github.com/aai-institute/pyDVL/pull/663)
+- New stopping criterion `MaxSamples`
+  [PR #661](https://github.com/aai-institute/pyDVL/pull/661)
+- Introduced `UtilityModel` and two implementations `IndicatorUtilityModel`
+  and `DeepSetsUtilityModel` for data utility learning
+  [PR #650](https://github.com/aai-institute/pyDVL/pull/650)
 - Introduced the concept of `ResultUpdater` in order to allow samplers to
   declare the proper strategy to use by valuations
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
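The new `MaxSamples` criterion composes with the existing stopping criteria. A minimal sketch of the intended usage, assuming a `MaxSamples(sampler, n_samples)` constructor (the exact signature is defined in PR #661 and should be checked against the API reference):

```python
# Hedged sketch, not taken from the diff: stopping criteria in pydvl.valuation
# compose with & and |. MaxSamples' constructor is assumed here to take the
# sampler whose draws it counts plus a sample budget -- verify against PR #661.
from pydvl.valuation import MinUpdates, UniformSampler
from pydvl.valuation.stopping import MaxSamples

sampler = UniformSampler(seed=42)
# Stop after 100 updates per index, or once 10_000 samples have been drawn:
done = MinUpdates(100) | MaxSamples(sampler, n_samples=10_000)
```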
@@ -16,8 +23,9 @@
   [PR #636](https://github.com/aai-institute/pyDVL/pull/636)
 - Refactor Classwise Shapley valuation with the interfaces and sampler
   architecture [PR #616](https://github.com/aai-institute/pyDVL/pull/616)
-- Refactor KNN Shapley values with the new sampler architecture
+- Refactor KNN Shapley values with the new interface
   [PR #610](https://github.com/aai-institute/pyDVL/pull/610)
+  [PR #645](https://github.com/aai-institute/pyDVL/pull/645)
 - Refactor MSR Banzhaf semivalues with the new sampler architecture.
   [PR #605](https://github.com/aai-institute/pyDVL/pull/605)
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
@@ -52,9 +60,14 @@

 ### Fixed

+- Fixed `show_warnings=False` not being respected in subprocesses. Introduced
+  `suppress_warnings` decorator for more flexibility
+  [PR #647](https://github.com/aai-institute/pyDVL/pull/647)
+  [PR #662](https://github.com/aai-institute/pyDVL/pull/662)
 - Fixed several bugs in diverse stopping criteria, including: iteration counts,
-  computing completion and resetting
+  computing completion, resetting, nested composition
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
+  [PR #650](https://github.com/aai-institute/pyDVL/pull/650)
 - Fixed all weights of all samplers to ensure that mix-and-matching samplers and
   semi-value methods always works, for all possible combinations
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
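The `suppress_warnings` decorator mentioned above is internal to pyDVL; as a rough illustration of the pattern (not the library's actual implementation), such a decorator can be sketched like this:

```python
import functools
import warnings

# Generic sketch of the pattern, NOT pyDVL's implementation: suppress warnings
# raised inside a method unless the instance opts in through an attribute
# (assumed here to be called `show_warnings`).
def suppress_warnings(fun):
    @functools.wraps(fun)
    def wrapper(self, *args, **kwargs):
        if getattr(self, "show_warnings", False):
            return fun(self, *args, **kwargs)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return fun(self, *args, **kwargs)
    return wrapper
```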
@@ -76,13 +89,20 @@

 ### Changed

+- Slicing, comparing and setting of `ValuationResult` behave in a more
+  natural way
+  [PR #660](https://github.com/aai-institute/pyDVL/pull/660)
 - Switched all semi-value coefficients and sampler weights to log-space in
   order to avoid overflows
   [PR #643](https://github.com/aai-institute/pyDVL/pull/643)
 - Updated and rewrote some of the MSR banzhaf notebook
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
 - Updated Least-Core notebook
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
+- Updated Shapley spotify notebook
+  [PR #628](https://github.com/aai-institute/pyDVL/pull/628)
+- Updated Data Utility notebook
+  [PR #650](https://github.com/aai-institute/pyDVL/pull/650)
 - Restructured and generalized `StratifiedSampler` to allow using heuristics,
   thus subsuming Variance-Reduced stratified sampling into a unified framework.
   Implemented the heuristics proposed in that paper
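The log-space change is easiest to see with the binomial coefficients that appear in semi-value weights; a small illustration (not pyDVL code):

```python
from math import exp, lgamma

# Illustration only, not pyDVL code: binomial terms in semi-value coefficients
# overflow float64 for moderate n, but their logarithms stay small.
def log_binom(n: int, k: int) -> float:
    return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)

lb = log_binom(2000, 1000)
print(lb)  # roughly 1382.3 -- representable, while exp(lb) would overflow
           # (float64 tops out near exp(709.78))

# Hence ratios of coefficient over sampler weight are computed as
# exp(log_coeff - log_weight), which stays finite when the two nearly cancel:
print(exp(log_binom(2000, 1000) - log_binom(2000, 1000)))  # 1.0
```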
@@ -93,6 +113,7 @@
   `GroupedDataset`, fixing inconsistencies in how the latter operates on indices.
   Also, both now return objects of the same type when slicing.
   [PR #631](https://github.com/aai-institute/pyDVL/pull/631)
+  [PR #648](https://github.com/aai-institute/pyDVL/pull/648)
 - Use tighter bounds for the calculation of the minimal sample size that guarantees
   an epsilon-delta approximation in group testing (Jia et al. 2023)
   [PR #602](https://github.com/aai-institute/pyDVL/pull/602)
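A hedged illustration of the slicing behaviour mentioned above; that slicing returns an object of the same type is taken from the changelog text itself and should be verified against PRs #631/#648:

```python
from sklearn.datasets import load_iris
from pydvl.valuation import Dataset

train, test = Dataset.from_sklearn(load_iris(), train_size=0.6)
# Assumption from the entry above: slicing a Dataset yields another Dataset
# over the selected indices rather than raw arrays.
subset = train[:10]
print(type(subset).__name__)  # expected: "Dataset"
```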

CONTRIBUTING.md

Lines changed: 47 additions & 19 deletions

@@ -15,10 +15,10 @@ If you are interested in setting up a similar project, consider the template

 ## Local development

-This project uses [black](https://github.com/psf/black) to format code and
+This project uses [ruff](https://github.com/astral-sh/ruff) to lint and format code and
 [pre-commit](https://pre-commit.com/) to invoke it as a git pre-commit hook.
-Consider installing any of [black's IDE
-integrations](https://black.readthedocs.io/en/stable/integrations/editors.html)
+Consider installing any of [ruff's IDE
+integrations](https://docs.astral.sh/ruff/editors/setup/)
 to make your life easier.

 Run the following to set up the pre-commit git hook to run before pushes:
@@ -83,7 +83,7 @@ If you use remote execution, don't forget to exclude data paths from deployment
 ## Testing

 Automated builds, tests, generation of documentation and publishing are handled
-by [CI pipelines](#CI). Before pushing your changes to the remote we recommend
+by [CI pipelines](#ci). Before pushing your changes to the remote we recommend
 to execute `tox` locally in order to detect mistakes early on and to avoid
 failing pipelines. tox will:
 * run the test suite
@@ -92,7 +92,7 @@ failing pipelines. tox will:
 * generate coverage reports in html, as well as badges.

 You can configure pytest, coverage and ruff by adjusting
-[pyproject.toml](pyproject.toml).
+[pyproject.toml](https://github.com/aai-institute/pyDVL/blob/develop/pyproject.toml).

 Besides the usual unit tests, most algorithms are tested using pytest. This
 requires ray for the parallelization and Memcached for caching. Please install
@@ -132,11 +132,11 @@ There are a few important arguments:
   of slow tests.

 - `--with-cuda` sets the device fixture in [tests/influence/torch/conftest.py](
-  tests/influence/torch/conftest.py) to `cuda` if it is available.
-  Using this fixture within tests, you can run parts of your tests on a `cuda`
-  device. Be aware, that you still have to take care of the usage of the device
-  manually in a specific test. Setting this flag does not result in
-  running all tests on a GPU.
+  https://github.com/aai-institute/pyDVL/blob/develop/tests/influence/torch/conftest.py)
+  to `cuda` if it is available. Using this fixture within tests, you can run parts
+  of your tests on a `cuda` device. Be aware that you still have to take care of
+  the usage of the device manually in a specific test. Setting this flag does not
+  result in running all tests on a GPU.

 ### Markers

@@ -297,6 +297,33 @@ the environment variable `DYLD_FALLBACK_LIBRARY_PATH`:
 export DYLD_FALLBACK_LIBRARY_PATH=$DYLD_FALLBACK_LIBRARY_PATH:/opt/homebrew/lib
 ```

+### Automatic API documentation
+
+We use [mkdocstrings](https://mkdocstrings.github.io/) to automatically generate
+API documentation from docstrings, following almost verbatim [this
+recipe](https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages):
+Stubs are generated for all modules on the fly using
+[generate_api_docs.py](https://github.com/aai-institute/pyDVL/blob/develop/build_scripts/generate_api_docs.py)
+thanks to the plugin
+[mkdocs-gen-files](https://github.com/oprypin/mkdocs-gen-files), and
+navigation is generated by
+[mkdocs-literate-nav](https://github.com/oprypin/mkdocs-literate-nav).
+
+With some renaming and using
+[section-index](https://github.com/oprypin/mkdocs-section-index), `__init__.py`
+files are used as entry points for the documentation of a module.
+
+Since very often we re-export symbols in the `__init__.py` files, the automatic
+generation of the documentation skips **all** symbols in those files. If you
+want to document any in particular you can do so by **overriding
+mkdocs_genfiles**: Create a file under `docs/api/pydvl/module/index.md` and add
+your documentation there. For example, to document the whole module and every
+(re-)exported symbol just add this to the file:
+
+```markdown
+::: pydvl.module
+```
+
 ### Adding new pages

 Navigation is configured in `mkdocs.yaml` using the nav section. We use the
@@ -384,7 +411,8 @@ library](https://www.zotero.org/groups/2703043/transferlab/library). All other
 contributors just add the bibtex data, and a maintainer will add it to the group
 library upon merging.

-To add a citation inside a markdown file, use the notation `[@citekey]`. Alas,
+To add a citation inside a markdown file, use the notation `[@ citekey]` (with
+no space). Alas,
 because of when mkdocs-bibtex enters the pipeline, it won't process docstrings.
 For module documentation, we manually inject html into the markdown files. For
 example, in `pydvl.value.shapley.montecarlo` we have:
@@ -440,7 +468,7 @@ use braces for legibility like in the first example.
 ### Abbreviations

 We keep the abbreviations used in the documentation inside the
-[docs_include/abbreviations.md](docs_includes%2Fabbreviations.md) file.
+[docs_include/abbreviations.md](https://github.com/aai-institute/pyDVL/blob/develop/docs_includes/abbreviations.md) file.

 The syntax for abbreviations is:
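The hunk is truncated before the syntax itself. For reference, the file follows the standard Markdown `abbr` extension format; a hypothetical entry (illustrative, not copied from the repository) looks like:

```markdown
*[CI]: Continuous Integration
*[API]: Application Programming Interface
```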
@@ -569,7 +597,7 @@ act -j lint
 act --artifact-server-path /tmp/artifacts

 # Run a job in a specific workflow (useful if you have duplicate job names)
-act -j lint -W .github/workflows/tox.yml
+act -j lint -W .github/workflows/publish.yml

 # Run in dry-run mode:
 act -n
@@ -727,9 +755,10 @@ PYPI_PASSWORD
 The first 2 are used after tests run on the develop branch's CI workflow
 to automatically publish packages to [TestPyPI](https://test.pypi.org/).

-The last 2 are used in the [publish.yaml](.github/workflows/publish.yaml) CI
-workflow to publish packages to [PyPI](https://pypi.org/) from `develop` after
-a GitHub release.
+The last 2 are used in the
+[publish.yaml](https://github.com/aai-institute/pyDVL/blob/develop/.github/workflows/publish.yaml)
+CI workflow to publish packages to [PyPI](https://pypi.org/) from `develop`
+after a GitHub release.

 #### Publish to TestPyPI
@@ -738,6 +767,5 @@ the build part of the version number without committing or tagging the change
 and then publish a package to TestPyPI from CI using Twine. The version
 has the GitHub run number appended.

-For more details refer to the files
-[.github/workflows/publish.yaml](.github/workflows/publish.yaml) and
-[.github/workflows/tox.yaml](.github/workflows/tox.yaml).
+For more details refer to the file
+[.github/workflows/publish.yaml](https://github.com/aai-institute/pyDVL/blob/develop/.github/workflows/publish.yaml).

README.md

Lines changed: 40 additions & 38 deletions

@@ -161,53 +161,55 @@ lazy_influences.to_zarr("influences_result", TorchNumpyConverter())
 The steps required to compute data values for your samples are:

 1. Import the necessary packages (the exact ones will depend on your specific
-   use case).
-2. Create a `Dataset` object with your train and test splits.
-3. Create an instance of a `SupervisedModel` (basically any sklearn compatible
-   predictor), and wrap it in a `Utility` object together with the data and a
-   scoring function.
-4. Use one of the methods defined in the library to compute the values. In the
-   example below, we will use *Permutation Montecarlo Shapley*, an approximate
-   method for computing Data Shapley values. The result is a variable of type
+   use case, but most of the interface is exposed through `pydvl.valuation`).
+2. Create two `Dataset` objects with your train and test splits. There are
+   some factories to do this from arrays or scikit-learn toy datasets.
+3. Create an instance of a `SupervisedScorer`, with any sklearn scorer and a
+   "valuation set" over which your model will be scored.
+4. Wrap model and scorer in a `ModelUtility`.
+5. Use one of the methods defined in the library to compute the values. In the
+   example below, we use the most basic *Montecarlo Shapley* with uniform
+   sampling, an approximate method for computing Data Shapley values.
+6. Call `fit` in a joblib parallel context. The result is a variable of type
    `ValuationResult` that contains the indices and their values as well as other
-   attributes.
-5. Convert the valuation result to a dataframe, and analyze and visualize the
-   values.
+   attributes. This object can be sliced, sorted and inspected directly, or you
+   can convert it to a dataframe for convenience.

 The higher the value for an index, the more important it is for the chosen
 model, dataset and scorer. Reciprocally, low-value points could be mislabelled,
 or out-of-distribution, and dropping them can improve the model's performance.

 ```python
-from sklearn.datasets import load_breast_cancer
-from sklearn.linear_model import LogisticRegression
-
-from pydvl.utils import Dataset, Scorer, Utility
-from pydvl.value import (MaxUpdates, RelativeTruncation,
-                         permutation_montecarlo_shapley)
-
-data = Dataset.from_sklearn(
-    load_breast_cancer(),
-    train_size=10,
-    stratify_by_target=True,
-    random_state=16,
-)
-model = LogisticRegression()
-u = Utility(
-    model,
-    data,
-    Scorer("accuracy", default=0.0)
-)
-values = permutation_montecarlo_shapley(
-    u,
-    truncation=RelativeTruncation(u, 0.05),
-    done=MaxUpdates(1000),
-    seed=16,
-    progress=True
-)
-df = values.to_dataframe(column="data_value")
+from joblib import parallel_config
+from sklearn.datasets import load_iris
+from sklearn.svm import SVC
+from pydvl.valuation import Dataset, ShapleyValuation, UniformSampler, \
+    MinUpdates, ModelUtility, SupervisedScorer
+
+seed = 42
+model = SVC(kernel="linear", probability=True, random_state=seed)
+
+train, val = Dataset.from_sklearn(load_iris(), train_size=0.6, random_state=24)
+scorer = SupervisedScorer(model, val, default=0.0)
+utility = ModelUtility(model, scorer)
+sampler = UniformSampler(batch_size=2 ** 6, seed=seed)
+stopping = MinUpdates(1000)
+valuation = ShapleyValuation(utility, sampler, stopping, progress=True)
+
+with parallel_config(n_jobs=32):
+    valuation.fit(train)
+
+result = valuation.values()
+df = result.to_dataframe(column="shapley")
 ```

+### Deprecation notice
+
+Up until v0.9.2 valuation methods were available through the `pydvl.value`
+module, which is now deprecated in favour of the design showcased above,
+available under `pydvl.valuation`. The old module will be removed in a future
+release.
+
 # Contributing

 Please open new issues for bugs, feature requests and extensions. You can read
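As a follow-up to the updated README example, the resulting dataframe can be inspected with ordinary pandas operations; illustrative usage, not part of the diff:

```python
# Illustrative continuation of the README snippet above (assumes `df` from it):
# sort by the "shapley" column to surface candidate mislabelled or
# out-of-distribution points at the bottom of the ranking.
lowest = df.sort_values("shapley").head(10)
print(lowest)
```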

build_scripts/copy_changelog.py

Lines changed: 13 additions & 17 deletions

@@ -14,25 +14,21 @@

 @mkdocs.plugins.event_priority(100)
 def on_pre_build(config):
-    logger.info("Temporarily copying changelog to docs directory")
+    logger.info("Link changelog to docs directory")
     try:
-        if os.path.getmtime(changelog_file) <= os.path.getmtime(target_filepath):
-            logger.info(
-                f"Changelog '{os.fspath(changelog_file)}' hasn't been updated, skipping."
-            )
-            return
-    except FileNotFoundError:
-        pass
-    logger.info(
-        f"Creating symbolic link for '{os.fspath(changelog_file)}' "
-        f"at '{os.fspath(target_filepath)}'"
-    )
-    target_filepath.symlink_to(changelog_file)
-
-    logger.info("Finished copying changelog to docs directory")
+        target_filepath.symlink_to(changelog_file)
+        logger.info(
+            f"Created symbolic link for '{os.fspath(changelog_file)}' "
+            f"at '{os.fspath(target_filepath)}'"
+        )
+    except FileExistsError:
+        logger.info(
+            f"File '{os.fspath(target_filepath)}' already exists, skipping symlink creation."
+        )


 @mkdocs.plugins.event_priority(-100)
 def on_shutdown():
-    logger.info("Removing temporary changelog in docs directory")
-    target_filepath.unlink()
+    pass  # Removing the link on shutdown makes mike fail the build
+    # logger.info("Removing temporary changelog in docs directory")
+    # target_filepath.unlink()
