
Commit f16e174

Merge branch 'develop' into dependabot/pip/transformers-4.48.0
2 parents: 42381ba + 5962321

File tree: 178 files changed, +11,174 −4,833 lines


.github/workflows/run-notebook-tests-workflow.yaml

Lines changed: 5 additions & 0 deletions

@@ -23,6 +23,11 @@ jobs:
   run-tests:
     runs-on: ubuntu-22.04
     steps:
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          large-packages: false
+          docker-images: false
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0

.notebook_test_durations

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
   "notebooks/influence_synthetic.ipynb::": 7.191153166999996,
   "notebooks/influence_wine.ipynb::": 11.610076332999995,
   "notebooks/least_core_basic.ipynb::": 14.069404709000011,
-  "notebooks/least_core_basic_new.ipynb::": 24.492538208000013,
   "notebooks/msr_banzhaf_digits.ipynb::": 86.62082037599998,
   "notebooks/shapley_basic_spotify.ipynb::": 15.088616748999982,
   "notebooks/shapley_knn_flowers.ipynb::": 6.810235208000023,

.test_durations

Lines changed: 6 additions & 6 deletions

@@ -1521,8 +1521,8 @@
   "tests/valuation/methods/test_semivalues.py::test_coefficients[BetaShapleyValuation-kwargs1-10]": 0.0016590010000072652,
   "tests/valuation/methods/test_semivalues.py::test_coefficients[BetaShapleyValuation-kwargs2-100]": 0.0022294990000091275,
   "tests/valuation/methods/test_semivalues.py::test_coefficients[BetaShapleyValuation-kwargs2-10]": 0.003863207999984297,
-  "tests/valuation/methods/test_semivalues.py::test_coefficients[DataBanzhafValuation-kwargs3-100]": 0.001800666000065121,
-  "tests/valuation/methods/test_semivalues.py::test_coefficients[DataBanzhafValuation-kwargs3-10]": 0.0016530420000435697,
+  "tests/valuation/methods/test_semivalues.py::test_coefficients[BanzhafValuation-kwargs3-100]": 0.001800666000065121,
+  "tests/valuation/methods/test_semivalues.py::test_coefficients[BanzhafValuation-kwargs3-10]": 0.0016530420000435697,
   "tests/valuation/methods/test_semivalues.py::test_coefficients[ShapleyValuation-kwargs4-100]": 0.0018769589999578784,
   "tests/valuation/methods/test_semivalues.py::test_coefficients[ShapleyValuation-kwargs4-10]": 0.0016063749999375432,
   "tests/valuation/methods/test_semivalues.py::test_msr_banzhaf[5]": 9.342398666999998,
@@ -1636,10 +1636,10 @@
   "tests/valuation/scorers/test_classwise.py::test_classwise_scorer[test_data2-expected_scores2]": 0.0025690839999974457,
   "tests/valuation/scorers/test_scorers.py::test_compose_score": 0.0019082069999996065,
   "tests/valuation/scorers/test_scorers.py::test_scorer": 0.001976999999998341,
-  "tests/valuation/test_interface.py::test_data_banzhaf_valuation[1]": 0.0836418330000015,
-  "tests/valuation/test_interface.py::test_data_banzhaf_valuation[2]": 1.2780167490000025,
-  "tests/valuation/test_interface.py::test_data_beta_shapley_valuation[1]": 4.139234666999997,
-  "tests/valuation/test_interface.py::test_data_beta_shapley_valuation[2]": 3.603092916999998,
+  "tests/valuation/test_interface.py::test_banzhaf_valuation[1]": 0.0836418330000015,
+  "tests/valuation/test_interface.py::test_banzhaf_valuation[2]": 1.2780167490000025,
+  "tests/valuation/test_interface.py::test_beta_shapley_valuation[1]": 4.139234666999997,
+  "tests/valuation/test_interface.py::test_beta_shapley_valuation[2]": 3.603092916999998,
   "tests/valuation/test_interface.py::test_shapley_valuation[1]": 0.27120083299999465,
   "tests/valuation/test_interface.py::test_shapley_valuation[2]": 0.15037520699999618,
   "tests/valuation/test_interface.py::test_data_utility_learning[1]": 0.026216332999993597,

CHANGELOG.md

Lines changed: 23 additions & 2 deletions

@@ -5,6 +5,13 @@

 ### Added

+- Simple memory monitor / reporting
+  [PR #663](https://github.com/aai-institute/pyDVL/pull/663)
+- New stopping criterion `MaxSamples`
+  [PR #661](https://github.com/aai-institute/pyDVL/pull/661)
+- Introduced `UtilityModel` and two implementations `IndicatorUtilityModel`
+  and `DeepSetsUtilityModel` for data utility learning
+  [PR #650](https://github.com/aai-institute/pyDVL/pull/650)
 - Introduced the concept of `ResultUpdater` in order to allow samplers to
   declare the proper strategy to use by valuations
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
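The new `MaxSamples` criterion composes with the existing stopping criteria. A minimal sketch of the intended usage, assuming a `MaxSamples(sampler, n_samples)` constructor (the exact signature is defined in PR #661 and should be checked against the API reference):

```python
# Hedged sketch, not taken from the diff: stopping criteria in pydvl.valuation
# compose with & and |. MaxSamples' constructor is assumed here to take the
# sampler whose draws it counts plus a sample budget -- verify against PR #661.
from pydvl.valuation import MinUpdates, UniformSampler
from pydvl.valuation.stopping import MaxSamples

sampler = UniformSampler(seed=42)
# Stop after 100 updates per index, or once 10_000 samples have been drawn:
done = MinUpdates(100) | MaxSamples(sampler, n_samples=10_000)
```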
@@ -16,8 +23,9 @@
   [PR #636](https://github.com/aai-institute/pyDVL/pull/636)
 - Refactor Classwise Shapley valuation with the interfaces and sampler
   architecture [PR #616](https://github.com/aai-institute/pyDVL/pull/616)
-- Refactor KNN Shapley values with the new sampler architecture
+- Refactor KNN Shapley values with the new interface
   [PR #610](https://github.com/aai-institute/pyDVL/pull/610)
+  [PR #645](https://github.com/aai-institute/pyDVL/pull/645)
 - Refactor MSR Banzhaf semivalues with the new sampler architecture.
   [PR #605](https://github.com/aai-institute/pyDVL/pull/605)
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
@@ -52,9 +60,14 @@

 ### Fixed

+- Fixed `show_warnings=False` not being respected in subprocesses. Introduced
+  `suppress_warnings` decorator for more flexibility
+  [PR #647](https://github.com/aai-institute/pyDVL/pull/647)
+  [PR #662](https://github.com/aai-institute/pyDVL/pull/662)
 - Fixed several bugs in diverse stopping criteria, including: iteration counts,
-  computing completion and resetting
+  computing completion, resetting, nested composition
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
+  [PR #650](https://github.com/aai-institute/pyDVL/pull/650)
 - Fixed all weights of all samplers to ensure that mix-and-matching samplers and
   semi-value methods always works, for all possible combinations
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
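The `suppress_warnings` decorator mentioned above is internal to pyDVL; as a rough illustration of the pattern (not the library's actual implementation), such a decorator can be sketched like this:

```python
import functools
import warnings

# Generic sketch of the pattern, NOT pyDVL's implementation: suppress warnings
# raised inside a method unless the instance opts in through an attribute
# (assumed here to be called `show_warnings`).
def suppress_warnings(fun):
    @functools.wraps(fun)
    def wrapper(self, *args, **kwargs):
        if getattr(self, "show_warnings", False):
            return fun(self, *args, **kwargs)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return fun(self, *args, **kwargs)
    return wrapper
```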
@@ -76,13 +89,20 @@

 ### Changed

+- Slicing, comparing and setting of `ValuationResult` behave in a more
+  natural way
+  [PR #660](https://github.com/aai-institute/pyDVL/pull/660)
 - Switched all semi-value coefficients and sampler weights to log-space in
   order to avoid overflows
   [PR #643](https://github.com/aai-institute/pyDVL/pull/643)
 - Updated and rewrote some of the MSR banzhaf notebook
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
 - Updated Least-Core notebook
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
+- Updated Shapley spotify notebook
+  [PR #628](https://github.com/aai-institute/pyDVL/pull/628)
+- Updated Data Utility notebook
+  [PR #650](https://github.com/aai-institute/pyDVL/pull/650)
 - Restructured and generalized `StratifiedSampler` to allow using heuristics,
   thus subsuming Variance-Reduced stratified sampling into a unified framework.
   Implemented the heuristics proposed in that paper
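The log-space change is easiest to see with the binomial coefficients that appear in semi-value weights; a small illustration (not pyDVL code):

```python
from math import exp, lgamma

# Illustration only, not pyDVL code: binomial terms in semi-value coefficients
# overflow float64 for moderate n, but their logarithms stay small.
def log_binom(n: int, k: int) -> float:
    return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)

lb = log_binom(2000, 1000)
print(lb)  # roughly 1382.3 -- representable, while exp(lb) would overflow
           # (float64 tops out near exp(709.78))

# Hence ratios of coefficient over sampler weight are computed as
# exp(log_coeff - log_weight), which stays finite when the two nearly cancel:
print(exp(log_binom(2000, 1000) - log_binom(2000, 1000)))  # 1.0
```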
@@ -93,6 +113,7 @@
   `GroupedDataset`, fixing inconsistencies in how the latter operates on indices.
   Also, both now return objects of the same type when slicing.
   [PR #631](https://github.com/aai-institute/pyDVL/pull/631)
+  [PR #648](https://github.com/aai-institute/pyDVL/pull/648)
 - Use tighter bounds for the calculation of the minimal sample size that guarantees
   an epsilon-delta approximation in group testing (Jia et al. 2023)
   [PR #602](https://github.com/aai-institute/pyDVL/pull/602)
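A hedged illustration of the slicing behaviour mentioned above; that slicing returns an object of the same type is taken from the changelog text itself and should be verified against PRs #631/#648:

```python
from sklearn.datasets import load_iris
from pydvl.valuation import Dataset

train, test = Dataset.from_sklearn(load_iris(), train_size=0.6)
# Assumption from the entry above: slicing a Dataset yields another Dataset
# over the selected indices rather than raw arrays.
subset = train[:10]
print(type(subset).__name__)  # expected: "Dataset"
```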

CONTRIBUTING.md

Lines changed: 47 additions & 19 deletions

@@ -15,10 +15,10 @@ If you are interested in setting up a similar project, consider the template

 ## Local development

-This project uses [black](https://github.com/psf/black) to format code and
+This project uses [ruff](https://github.com/astral-sh/ruff) to lint and format code and
 [pre-commit](https://pre-commit.com/) to invoke it as a git pre-commit hook.
-Consider installing any of [black's IDE
-integrations](https://black.readthedocs.io/en/stable/integrations/editors.html)
+Consider installing any of [ruff's IDE
+integrations](https://docs.astral.sh/ruff/editors/setup/)
 to make your life easier.

 Run the following to set up the pre-commit git hook to run before pushes:
@@ -83,7 +83,7 @@ If you use remote execution, don't forget to exclude data paths from deployment
 ## Testing

 Automated builds, tests, generation of documentation and publishing are handled
-by [CI pipelines](#CI). Before pushing your changes to the remote we recommend
+by [CI pipelines](#ci). Before pushing your changes to the remote we recommend
 to execute `tox` locally in order to detect mistakes early on and to avoid
 failing pipelines. tox will:
 * run the test suite
@@ -92,7 +92,7 @@ failing pipelines. tox will:
 * generate coverage reports in html, as well as badges.

 You can configure pytest, coverage and ruff by adjusting
-[pyproject.toml](pyproject.toml).
+[pyproject.toml](https://github.com/aai-institute/pyDVL/blob/develop/pyproject.toml).

 Besides the usual unit tests, most algorithms are tested using pytest. This
 requires ray for the parallelization and Memcached for caching. Please install
@@ -132,11 +132,11 @@ There are a few important arguments:
   of slow tests.

 - `--with-cuda` sets the device fixture in [tests/influence/torch/conftest.py](
-  tests/influence/torch/conftest.py) to `cuda` if it is available.
-  Using this fixture within tests, you can run parts of your tests on a `cuda`
-  device. Be aware, that you still have to take care of the usage of the device
-  manually in a specific test. Setting this flag does not result in
-  running all tests on a GPU.
+  https://github.com/aai-institute/pyDVL/blob/develop/tests/influence/torch/conftest.py)
+  to `cuda` if it is available. Using this fixture within tests, you can run parts
+  of your tests on a `cuda` device. Be aware that you still have to take care of
+  the usage of the device manually in a specific test. Setting this flag does not
+  result in running all tests on a GPU.

 ### Markers

@@ -297,6 +297,33 @@ the environment variable `DYLD_FALLBACK_LIBRARY_PATH`:
 export DYLD_FALLBACK_LIBRARY_PATH=$DYLD_FALLBACK_LIBRARY_PATH:/opt/homebrew/lib
 ```

+### Automatic API documentation
+
+We use [mkdocstrings](https://mkdocstrings.github.io/) to automatically generate
+API documentation from docstrings, following almost verbatim [this
+recipe](https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages):
+Stubs are generated for all modules on the fly using
+[generate_api_docs.py](https://github.com/aai-institute/pyDVL/blob/develop/build_scripts/generate_api_docs.py)
+thanks to the plugin
+[mkdocs-gen-files](https://github.com/oprypin/mkdocs-gen-files), and
+navigation is generated by
+[mkdocs-literate-nav](https://github.com/oprypin/mkdocs-literate-nav).
+
+With some renaming and using
+[section-index](https://github.com/oprypin/mkdocs-section-index), `__init__.py`
+files are used as entry points for the documentation of a module.
+
+Since very often we re-export symbols in the `__init__.py` files, the automatic
+generation of the documentation skips **all** symbols in those files. If you
+want to document any in particular you can do so by **overriding
+mkdocs_genfiles**: Create a file under `docs/api/pydvl/module/index.md` and add
+your documentation there. For example, to document the whole module and every
+(re-)exported symbol just add this to the file:
+
+```markdown
+::: pydvl.module
+```
+
 ### Adding new pages

 Navigation is configured in `mkdocs.yaml` using the nav section. We use the
@@ -384,7 +411,8 @@ library](https://www.zotero.org/groups/2703043/transferlab/library). All other
 contributors just add the bibtex data, and a maintainer will add it to the group
 library upon merging.

-To add a citation inside a markdown file, use the notation `[@citekey]`. Alas,
+To add a citation inside a markdown file, use the notation `[@ citekey]` (with
+no space). Alas,
 because of when mkdocs-bibtex enters the pipeline, it won't process docstrings.
 For module documentation, we manually inject html into the markdown files. For
 example, in `pydvl.value.shapley.montecarlo` we have:
@@ -440,7 +468,7 @@ use braces for legibility like in the first example.
 ### Abbreviations

 We keep the abbreviations used in the documentation inside the
-[docs_include/abbreviations.md](docs_includes%2Fabbreviations.md) file.
+[docs_include/abbreviations.md](https://github.com/aai-institute/pyDVL/blob/develop/docs_includes/abbreviations.md) file.

 The syntax for abbreviations is:
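The hunk is truncated before the syntax itself. For reference, the file follows the standard Markdown `abbr` extension format; a hypothetical entry (illustrative, not copied from the repository) looks like:

```markdown
*[CI]: Continuous Integration
*[API]: Application Programming Interface
```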
@@ -569,7 +597,7 @@ act -j lint
 act --artifact-server-path /tmp/artifacts

 # Run a job in a specific workflow (useful if you have duplicate job names)
-act -j lint -W .github/workflows/tox.yml
+act -j lint -W .github/workflows/publish.yml

 # Run in dry-run mode:
 act -n
@@ -727,9 +755,10 @@ PYPI_PASSWORD
 The first 2 are used after tests run on the develop branch's CI workflow
 to automatically publish packages to [TestPyPI](https://test.pypi.org/).

-The last 2 are used in the [publish.yaml](.github/workflows/publish.yaml) CI
-workflow to publish packages to [PyPI](https://pypi.org/) from `develop` after
-a GitHub release.
+The last 2 are used in the
+[publish.yaml](https://github.com/aai-institute/pyDVL/blob/develop/.github/workflows/publish.yaml)
+CI workflow to publish packages to [PyPI](https://pypi.org/) from `develop`
+after a GitHub release.

 #### Publish to TestPyPI
@@ -738,6 +767,5 @@ the build part of the version number without committing or tagging the change
 and then publish a package to TestPyPI from CI using Twine. The version
 has the GitHub run number appended.

-For more details refer to the files
-[.github/workflows/publish.yaml](.github/workflows/publish.yaml) and
-[.github/workflows/tox.yaml](.github/workflows/tox.yaml).
+For more details refer to the file
+[.github/workflows/publish.yaml](https://github.com/aai-institute/pyDVL/blob/develop/.github/workflows/publish.yaml).

README.md

Lines changed: 40 additions & 38 deletions

@@ -161,53 +161,55 @@ lazy_influences.to_zarr("influences_result", TorchNumpyConverter())
 The steps required to compute data values for your samples are:

 1. Import the necessary packages (the exact ones will depend on your specific
-   use case).
-2. Create a `Dataset` object with your train and test splits.
-3. Create an instance of a `SupervisedModel` (basically any sklearn compatible
-   predictor), and wrap it in a `Utility` object together with the data and a
-   scoring function.
-4. Use one of the methods defined in the library to compute the values. In the
-   example below, we will use *Permutation Montecarlo Shapley*, an approximate
-   method for computing Data Shapley values. The result is a variable of type
+   use case, but most of the interface is exposed through `pydvl.valuation`).
+2. Create two `Dataset` objects with your train and test splits. There are
+   some factories to do this from arrays or scikit-learn toy datasets.
+3. Create an instance of a `SupervisedScorer`, with any sklearn scorer and a
+   "valuation set" over which your model will be scored.
+4. Wrap model and scorer in a `ModelUtility`.
+5. Use one of the methods defined in the library to compute the values. In the
+   example below, we use the most basic *Montecarlo Shapley* with uniform
+   sampling, an approximate method for computing Data Shapley values.
+6. Call `fit` in a joblib parallel context. The result is a variable of type
    `ValuationResult` that contains the indices and their values as well as other
-   attributes.
-5. Convert the valuation result to a dataframe, and analyze and visualize the
-   values.
+   attributes. This object can be sliced, sorted and inspected directly, or you
+   can convert it to a dataframe for convenience.

 The higher the value for an index, the more important it is for the chosen
 model, dataset and scorer. Reciprocally, low-value points could be mislabelled,
 or out-of-distribution, and dropping them can improve the model's performance.

 ```python
-from sklearn.datasets import load_breast_cancer
-from sklearn.linear_model import LogisticRegression
-
-from pydvl.utils import Dataset, Scorer, Utility
-from pydvl.value import (MaxUpdates, RelativeTruncation,
-                         permutation_montecarlo_shapley)
-
-data = Dataset.from_sklearn(
-    load_breast_cancer(),
-    train_size=10,
-    stratify_by_target=True,
-    random_state=16,
-)
-model = LogisticRegression()
-u = Utility(
-    model,
-    data,
-    Scorer("accuracy", default=0.0)
-)
-values = permutation_montecarlo_shapley(
-    u,
-    truncation=RelativeTruncation(u, 0.05),
-    done=MaxUpdates(1000),
-    seed=16,
-    progress=True
-)
-df = values.to_dataframe(column="data_value")
+from joblib import parallel_config
+from sklearn.datasets import load_iris
+from sklearn.svm import SVC
+from pydvl.valuation import Dataset, ShapleyValuation, UniformSampler, \
+    MinUpdates, ModelUtility, SupervisedScorer
+
+seed = 42
+model = SVC(kernel="linear", probability=True, random_state=seed)
+
+train, val = Dataset.from_sklearn(load_iris(), train_size=0.6, random_state=24)
+scorer = SupervisedScorer(model, val, default=0.0)
+utility = ModelUtility(model, scorer)
+sampler = UniformSampler(batch_size=2 ** 6, seed=seed)
+stopping = MinUpdates(1000)
+valuation = ShapleyValuation(utility, sampler, stopping, progress=True)
+
+with parallel_config(n_jobs=32):
+    valuation.fit(train)
+
+result = valuation.values()
+df = result.to_dataframe(column="shapley")
 ```

+### Deprecation notice
+
+Up until v0.9.2 valuation methods were available through the `pydvl.value`
+module, which is now deprecated in favour of the design showcased above,
+available under `pydvl.valuation`. The old module will be removed in a future
+release.
+
 # Contributing

 Please open new issues for bugs, feature requests and extensions. You can read
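As a follow-up to the updated README example, the resulting dataframe can be inspected with ordinary pandas operations; illustrative usage, not part of the diff:

```python
# Illustrative continuation of the README snippet above (assumes `df` from it):
# sort by the "shapley" column to surface candidate mislabelled or
# out-of-distribution points at the bottom of the ranking.
lowest = df.sort_values("shapley").head(10)
print(lowest)
```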

build_scripts/copy_changelog.py

Lines changed: 13 additions & 17 deletions

@@ -14,25 +14,21 @@

 @mkdocs.plugins.event_priority(100)
 def on_pre_build(config):
-    logger.info("Temporarily copying changelog to docs directory")
+    logger.info("Link changelog to docs directory")
     try:
-        if os.path.getmtime(changelog_file) <= os.path.getmtime(target_filepath):
-            logger.info(
-                f"Changelog '{os.fspath(changelog_file)}' hasn't been updated, skipping."
-            )
-            return
-    except FileNotFoundError:
-        pass
-    logger.info(
-        f"Creating symbolic link for '{os.fspath(changelog_file)}' "
-        f"at '{os.fspath(target_filepath)}'"
-    )
-    target_filepath.symlink_to(changelog_file)
-
-    logger.info("Finished copying changelog to docs directory")
+        target_filepath.symlink_to(changelog_file)
+        logger.info(
+            f"Created symbolic link for '{os.fspath(changelog_file)}' "
+            f"at '{os.fspath(target_filepath)}'"
+        )
+    except FileExistsError:
+        logger.info(
+            f"File '{os.fspath(target_filepath)}' already exists, skipping symlink creation."
+        )


 @mkdocs.plugins.event_priority(-100)
 def on_shutdown():
-    logger.info("Removing temporary changelog in docs directory")
-    target_filepath.unlink()
+    pass  # Removing the link on shutdown makes mike fail the build
+    # logger.info("Removing temporary changelog in docs directory")
+    # target_filepath.unlink()
