diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 4229f0fd..9061dba4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -11,22 +11,10 @@ defaults: shell: bash -el {0} jobs: - black: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: psf/black@stable - with: - options: "--check" - src: "." - jupyter: true - version: "24.3" - populate-cache: runs-on: ubuntu-latest outputs: - cache-key: ${{steps.cache-key.outputs.cache-key}} - cache-key-light: ${{steps.cache-key.outputs.cache-key}}-light + cache-key: ${{steps.cache-key.outputs.cache-key}}-0 steps: - name: Get Date id: get-date @@ -48,7 +36,7 @@ jobs: with: path: bioimageio_cache key: ${{steps.cache-key.outputs.cache-key}} - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 if: steps.look-up.outputs.cache-hit != 'true' with: python-version: '3.12' @@ -72,112 +60,62 @@ jobs: - python-version: '3.9' conda-env: dev spec: conda - - python-version: '3.10' + numpy-version: 1 + - python-version: '3.9' conda-env: dev - spec: conda - - python-version: '3.11' - conda-env: full spec: main + numpy-version: 2 + - python-version: '3.10' + conda-env: full run-expensive-tests: true report-coverage: true save-cache: true - - python-version: '3.12' - conda-env: dev spec: conda - - python-version: '3.13' + numpy-version: 1 + - python-version: '3.11' conda-env: dev spec: main - save-cache: true + numpy-version: 2 + - python-version: '3.12' + conda-env: dev + spec: conda + numpy-version: 1 + # - python-version: '3.13' + # conda-env: '313' + # spec: main + # numpy-version: 2 steps: - uses: actions/checkout@v4 - - id: setup - run: | - echo "env-name=${{ matrix.spec }}-${{ matrix.conda-env }}-${{ matrix.python-version }}" - echo "env-name=${{ matrix.spec }}-${{ matrix.conda-env }}-${{ matrix.python-version }}" >> $GITHUB_OUTPUT - echo "env-file=dev/env-${{ matrix.conda-env }}.yaml" - echo "env-file=dev/env-${{ matrix.conda-env }}.yaml" >> $GITHUB_OUTPUT - - name: check on env-file - shell: python - run: | - from pathlib import Path - from pprint import pprint - if not (env_path:=Path("${{steps.setup.outputs.env-file}}")).exists(): - if env_path.parent.exists(): - pprint(env_path.parent.glob("*")) - else: - pprint(Path().glob("*")) - raise FileNotFoundError(f"{env_path} does not exist") - - - uses: conda-incubator/setup-miniconda@v3 - with: - auto-update-conda: true - auto-activate-base: true - activate-environment: ${{steps.setup.outputs.env-name}} - channel-priority: strict - miniforge-version: latest - - name: Get Date - id: get-date - run: | - echo "today=$(date -u '+%Y%m%d')" - echo "today=$(date -u '+%Y%m%d')" >> $GITHUB_OUTPUT - - name: Restore cached env - uses: actions/cache/restore@v4 + - uses: actions/setup-python@v6 with: - path: ${{env.CONDA}}/envs/${{steps.setup.outputs.env-name}} - key: >- - conda-${{runner.os}}-${{runner.arch}} - -${{steps.get-date.outputs.today}} - -${{hashFiles(steps.setup.outputs.env-file)}} - -${{env.CACHE_NUMBER}} - env: - CACHE_NUMBER: 0 - id: cache-env - - name: Install env - run: conda env update --name=${{steps.setup.outputs.env-name}} --file=${{steps.setup.outputs.env-file}} python=${{matrix.python-version}} - if: steps.cache-env.outputs.cache-hit != 'true' - - name: Install uncached pip dependencies + python-version: ${{matrix.python-version}} + cache: 'pip' + - name: Install dependencies run: | pip install --upgrade pip - pip install --no-deps -e . 
- - name: Install uncached pip dependencies for 'full' environment - if: matrix.conda-env == 'full' - run: | - pip install git+https://github.com/ChaoningZhang/MobileSAM.git - - name: Cache env - if: steps.cache-env.outputs.cache-hit != 'true' - uses: actions/cache/save@v4 - with: - path: ${{env.CONDA}}/envs/${{steps.setup.outputs.env-name}} - key: >- - conda-${{runner.os}}-${{runner.arch}} - -${{steps.get-date.outputs.today}} - -${{hashFiles(steps.setup.outputs.env-file)}} - -${{env.CACHE_NUMBER}} - env: - CACHE_NUMBER: 0 - - run: conda list + pip install -e .[dev] numpy==${{matrix.numpy-version}}.* - name: Pyright if: matrix.run-expensive-tests run: | pyright --version pyright -p pyproject.toml --pythonversion ${{ matrix.python-version }} - - name: Restore bioimageio cache ${{matrix.run-expensive-tests && needs.populate-cache.outputs.cache-key || needs.populate-cache.outputs.cache-key-light}} + - name: Restore bioimageio cache ${{needs.populate-cache.outputs.cache-key}} uses: actions/cache/restore@v4 with: path: bioimageio_cache - key: ${{matrix.run-expensive-tests && needs.populate-cache.outputs.cache-key || needs.populate-cache.outputs.cache-key-light}} + key: ${{needs.populate-cache.outputs.cache-key}} - name: pytest run: pytest --cov bioimageio --cov-report xml --cov-append --capture no --disable-pytest-warnings env: BIOIMAGEIO_CACHE_PATH: bioimageio_cache RUN_EXPENSIVE_TESTS: ${{ matrix.run-expensive-tests && 'true' || 'false' }} - - name: Save bioimageio cache ${{matrix.run-expensive-tests && needs.populate-cache.outputs.cache-key || needs.populate-cache.outputs.cache-key-light}} + - name: Save bioimageio cache ${{needs.populate-cache.outputs.cache-key}} if: matrix.save-cache uses: actions/cache/save@v4 with: path: bioimageio_cache - key: ${{matrix.run-expensive-tests && needs.populate-cache.outputs.cache-key || needs.populate-cache.outputs.cache-key-light}} + key: ${{needs.populate-cache.outputs.cache-key}} - if: matrix.report-coverage && github.event_name == 'pull_request' uses: orgoro/coverage@v3.2 @@ -223,7 +161,7 @@ jobs: shell: bash -l {0} run: | mkdir -p ./pkgs/noarch - conda-build -c conda-forge conda-recipe --no-test --output-folder ./pkgs + conda-build -c conda-forge conda-recipe --output-folder ./pkgs docs: needs: test @@ -235,7 +173,7 @@ jobs: with: name: coverage path: dist - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: '3.12' cache: 'pip' @@ -253,10 +191,9 @@ jobs: branch: gh-pages folder: dist - publish-pypi: - name: Publish to PyPI + pip-build: + name: Build with pip and publish to PyPI needs: test - if: github.ref == 'refs/heads/main' runs-on: ubuntu-latest steps: - name: Check out the repository @@ -265,16 +202,18 @@ jobs: fetch-depth: 2 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: - python-version: '3.12' - - name: Upgrade pip + python-version: '3.10' + - name: Install dependencies run: | python -m pip install --upgrade pip pip --version - pip install wheel .[tests] - - name: run tests - run: pytest --disable-pytest-warnings + pip install --upgrade build + pip install . 
+ - name: Build package + run: | + python -m build - name: Check if there is a parent commit id: check-parent-commit @@ -283,27 +222,23 @@ jobs: - name: Detect new version id: check-version - if: steps.check-parent-commit.outputs.sha + if: github.ref == 'refs/heads/main' && steps.check-parent-commit.outputs.sha uses: salsify/action-detect-and-tag-new-version@v2.0.3 with: create-tag: false version-command: | - bash -o pipefail -c "cat bioimageio/core/VERSION | jq -r '.version'" + python -c "from pathlib import Path;p = p_src if (p_src:=Path('src/bioimageio/core/__init__.py')).exists() else Path('bioimageio/core/__init__.py');print(p.read_text().split('__version__ = \"')[1].split('\"')[0])" - name: Push tag id: tag-version - if: steps.check-version.outputs.previous-version != steps.check-version.outputs.current-version + if: github.ref == 'refs/heads/main' && steps.check-version.outputs.previous-version != steps.check-version.outputs.current-version uses: mathieudutour/github-tag-action@v5.5 with: github_token: ${{ secrets.GITHUB_TOKEN }} custom_tag: ${{ steps.check-version.outputs.current-version }} - - name: Build package - run: | - python setup.py sdist bdist_wheel - - name: Publish package on PyPI - if: steps.tag-version.outputs.new_tag + if: github.ref == 'refs/heads/main' && steps.tag-version.outputs.new_tag uses: pypa/gh-action-pypi-publish@release/v1.12 with: user: __token__ @@ -311,6 +246,7 @@ jobs: packages-dir: dist/ verbose: true - name: Publish the release notes + if: github.ref == 'refs/heads/main' uses: release-drafter/release-drafter@v6.0.0 with: publish: "${{ steps.tag-version.outputs.new_tag != '' }}" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2bee435e..9544bc10 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ repos: - repo: https://github.com/ambv/black - rev: 24.2.0 + rev: 25.1.0 hooks: - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.2 + rev: v0.12.8 hooks: - - id: ruff + - id: ruff-check args: [--fix, --show-fixes] - repo: local hooks: diff --git a/MANIFEST.in b/MANIFEST.in index e1d35f13..04f196ac 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,2 @@ -include bioimageio/core/VERSION include README.md include LICENSE diff --git a/README.md b/README.md index 0359b953..82d93c94 100644 --- a/README.md +++ b/README.md @@ -364,7 +364,7 @@ may be controlled with the `LOGURU_LEVEL` environment variable. 
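The `LOGURU_LEVEL` note above and the new CLI entry point (`src/bioimageio/core/__main__.py`, further down in this diff) both concern loguru-based logging: the library disables its logger on import and the CLI re-enables and reconfigures it. A minimal sketch of how a user script might opt back in; the level and handler format here are illustrative assumptions, not fixed by the package:

```python
# Hedged sketch: two ways to control bioimageio.core's loguru output from user code.
import os
import sys

# Option 1: LOGURU_LEVEL is read by loguru when it is first imported,
# so it must be set before bioimageio.core (or loguru) is imported.
os.environ.setdefault("LOGURU_LEVEL", "INFO")

from loguru import logger

# Option 2: re-enable the library logger that bioimageio.core disables on import
# and attach a custom handler (mirroring what the CLI entry point does).
logger.enable("bioimageio.core")
logger.remove()
_ = logger.add(sys.stderr, level="DEBUG", format="{time} | {level} | {message}")
```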
## Changelog -### next release +### 0.9.1 - fixes: - CLI diff --git a/bioimageio/core/VERSION b/bioimageio/core/VERSION deleted file mode 100644 index 2069fe23..00000000 --- a/bioimageio/core/VERSION +++ /dev/null @@ -1,3 +0,0 @@ -{ - "version": "0.9.0" -} diff --git a/bioimageio/core/__main__.py b/bioimageio/core/__main__.py deleted file mode 100644 index 436448f5..00000000 --- a/bioimageio/core/__main__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .cli import Bioimageio - - -def main(): - cli = Bioimageio() # pyright: ignore[reportCallIssue] - cli.run() - - -if __name__ == "__main__": - main() diff --git a/bioimageio/core/utils/__init__.py b/bioimageio/core/utils/__init__.py deleted file mode 100644 index 695f0172..00000000 --- a/bioimageio/core/utils/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -import json -import sys -from pathlib import Path - -from ._compare import compare as compare - -if sys.version_info < (3, 9): - - def files(package_name: str): - assert package_name == "bioimageio.core" - return Path(__file__).parent.parent - -else: - from importlib.resources import files as files - - -with files("bioimageio.core").joinpath("VERSION").open("r", encoding="utf-8") as f: - VERSION = json.load(f)["version"] - assert isinstance(VERSION, str) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 3f63037b..4275802a 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,8 +1,13 @@ -{% set setup_py_data = load_setup_py_data() %} +{% set pyproject = load_file_data('pyproject.toml') %} +{% set version_match = load_file_regex( + load_file="src/bioimageio/core/__init__.py", + regex_pattern='__version__ = "(.+)"') %} +{% set version = version_match[1] %} + package: - name: bioimageio.core - version: {{ setup_py_data['version'] }} + name: {{ pyproject['project']['name'] }} + version: {{ version }} source: path: .. @@ -11,52 +16,53 @@ build: noarch: python number: 0 entry_points: - {% for ep in setup_py_data['entry_points']['console_scripts'] %} - - {{ ep }} - {% endfor %} - script: "{{ PYTHON }} -m pip install --no-deps --ignore-installed ." + bioimageio = {{ pyproject['project']['scripts']['bioimageio'] }} + script: python -m pip install --no-deps --ignore-installed . 
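The recipe above now extracts the package version from `src/bioimageio/core/__init__.py` with `load_file_regex`, and the workflow's `version-command` one-liner reads the same attribute. A rough Python equivalent of that lookup, assuming it is run from the repository root with the new src layout in place:

```python
# Hedged sketch of the version lookup done by the conda recipe (load_file_regex)
# and the CI version-command: read __version__ straight from the package source.
import re
from pathlib import Path

source = Path("src/bioimageio/core/__init__.py").read_text(encoding="utf-8")
match = re.search(r'__version__ = "(.+)"', source)
assert match is not None, "no __version__ assignment found"
print(match.group(1))  # e.g. 0.9.1
```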
requirements: host: - - python >=3.9,<3.13 - - pip + - python {{ pyproject['project']['requires-python'] }} + {% for dep in pyproject['build-system']['requires'] %} + - {{ dep.lower() }} + {% endfor %} run: - - python >=3.9,<3.13 - {% for dep in setup_py_data['install_requires'] %} + - python {{ pyproject['project']['requires-python'] }} + {% for dep in pyproject['project']['dependencies'] %} - {{ dep.lower() }} {% endfor %} run_constrained: - cudatoolkit>=10.1 - {% for dep in setup_py_data['extras_require']['pytorch'] %} + {% for dep in pyproject['project']['optional-dependencies']['pytorch']%} {% if dep.startswith('torch>=') %} # pip: torch -> conda: pytorch - py{{ dep.lower() }} {% else %} - {{ dep.lower() }} {% endif %} {% endfor %} - {% for dep in setup_py_data['extras_require']['onnx'] %} + {% for dep in pyproject['project']['optional-dependencies']['onnx'] %} - {{ dep.lower() }} {% endfor %} - {% for dep in setup_py_data['extras_require']['tensorflow'] %} + {% for dep in pyproject['project']['optional-dependencies']['tensorflow'] %} - {{ dep.lower() }} {% endfor %} test: imports: - - bioimageio.core + - {{ pyproject['project']['name'] }} source_files: - tests requires: - {% for dep in setup_py_data['extras_require']['dev'] %} + {% for dep in pyproject['project']['optional-dependencies']['dev'] %} + - {{ dep.replace('torch', 'pytorch').lower().replace('_', '-') }} {% if dep.startswith('torch>=') %} # pip: torch -> conda: pytorch - py{{ dep.lower() }} {% else %} - - {{ dep.lower() }} + - {{ dep.lower().replace('_', '-') }} {% endif %} {% endfor %} commands: - - pytest + - pytest --capture=no about: diff --git a/dev/env-dev.yaml b/dev/env-dev.yaml deleted file mode 100644 index 38cbb289..00000000 --- a/dev/env-dev.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# modified copy of env-full.yaml wo dependencies 'for model testing' -name: core -channels: - - conda-forge - - nodefaults - - pytorch -dependencies: - - bioimageio.spec==0.5.4.3 - - black - # - crick # currently requires python<=3.9 - - h5py - - httpx - - imagecodecs - - imageio>=2.5 - - jupyter - - jupyter-black - - keras>=3.0,<4 - - loguru - - matplotlib - - napari - - numpy - - onnx - - onnxruntime - - packaging>=17.0 - - pdoc - - pip - - pre-commit - - psutil - - pydantic - - pydantic-settings - - pyright - - pytest - - pytest-cov - # - python=3.11 # removed - - pytorch>=2.1,<3 - - respx - - rich - - ruff - - ruyaml - - tensorflow>=2,<3 - - torchvision - - tqdm - - typing-extensions - - xarray>=2024.01,<2025.3.0 - - pip: - - -e .. 
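With `dev/env-dev.yaml` (and, below, the remaining `dev/env-*.yaml` files) removed, the dependency groups live only in `pyproject.toml`, which both the CI (`pip install -e .[dev]`) and the recipe's `load_file_data` consume. A small sketch of reading those groups directly; it assumes Python 3.11+ for `tomllib` (the `tomli` package offers the same API on 3.9/3.10):

```python
# Hedged sketch: inspect the dependency groups that replaced the dev/env-*.yaml files.
import tomllib  # Python 3.11+; use the third-party "tomli" package on older interpreters
from pathlib import Path

pyproject = tomllib.loads(Path("pyproject.toml").read_text(encoding="utf-8"))
project = pyproject["project"]

print(project["name"], project["requires-python"])
print(project["dependencies"][:3])                # first few runtime requirements
print(sorted(project["optional-dependencies"]))   # e.g. ['dev', 'onnx', 'pytorch', 'tensorflow']
```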
diff --git a/dev/env-full.yaml b/dev/env-full.yaml deleted file mode 100644 index 993a22c5..00000000 --- a/dev/env-full.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: core-full -channels: - - conda-forge - - nodefaults - - pytorch -dependencies: - - bioimageio.spec==0.5.4.3 - - black - - cellpose # for model testing - # - crick # currently requires python<=3.9 - - h5py - - imagecodecs - - imageio>=2.5 - - jupyter - - jupyter-black - - keras>=3.0,<4 - - loguru - - matplotlib - - monai # for model testing - - napari - - numpy - - onnx - - onnxruntime - - packaging>=17.0 - - pdoc - - pip - - pre-commit - - psutil - - pydantic - - pydantic-settings - - pyright - - pytest - - pytest-cov - - python=3.11 # 3.12 not supported by cellpose->fastremap - - pytorch>=2.1,<3 - - httpx - - respx - - rich - - ruff - - ruyaml - - segment-anything # for model testing - - tensorflow>=2,<3 - - timm # for model testing - - torchvision>=0.21 - - tqdm - - typing-extensions - - xarray>=2024.01,<2025.3.0 - - pip: - # - careamics # for model testing (TODO: install without exact bioimageio.core pin) - - git+https://github.com/ChaoningZhang/MobileSAM.git # for model testing - - -e .. diff --git a/dev/env-gpu.yaml b/dev/env-gpu.yaml deleted file mode 100644 index 25229679..00000000 --- a/dev/env-gpu.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# version of enf-full for running on GPU -name: core-gpu -channels: - - conda-forge - - nodefaults -dependencies: - - bioimageio.spec==0.5.4.3 - - black - - cellpose # for model testing - # - crick # currently requires python<=3.9 - - h5py - - imagecodecs - - imageio>=2.5 - - jupyter - - jupyter-black - - keras>=3.0,<4 - - loguru - - matplotlib - - monai # for model testing - - numpy - - onnx - - packaging>=17.0 - - pdoc - - pip - - pre-commit - - psutil - - pydantic<2.9 - - pydantic-settings - - pyright - - pytest - - pytest-cov - - python=3.11 - - httpx - - respx - - rich - - ruff - - ruyaml - - segment-anything # for model testing - - timm # for model testing - - tqdm - - typing-extensions - - xarray>=2024.01,<2025.3.0 - - pip: - # - tf2onnx # TODO: add tf2onnx - - --extra-index-url https://download.pytorch.org/whl/cu126 - # - careamics # for model testing (TODO: install without exact bioimageio.core pin) - - git+https://github.com/ChaoningZhang/MobileSAM.git # for model testing - - onnxruntime-gpu - - tensorflow - - torch - - torchaudio - - torchvision>=0.21 - - -e .. diff --git a/dev/env-py38.yaml b/dev/env-py38.yaml deleted file mode 100644 index d280bbd5..00000000 --- a/dev/env-py38.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# DEPRECATED -# manipulated copy of env-full.yaml wo dependencies 'for model testing' for python 3.8 -name: core-py38 -channels: - - conda-forge - - nodefaults - - pytorch -dependencies: - - bioimageio.spec==0.5.4.3 - - black - - crick # uncommented - - h5py - - imagecodecs - - imageio>=2.5 - - jupyter - - jupyter-black - # - keras>=3.0,<4 # removed - - loguru - - matplotlib - - numpy - - onnx - - onnxruntime - - packaging>=17.0 - - pdoc - - pip - - pre-commit - - psutil - - pydantic - - pydantic-settings - - pyright - - pytest - - pytest-cov - - python=3.8 # changed - - pytorch>=2.1,<3 - - httpx - - respx - - rich - - ruff - - ruyaml - # - tensorflow>=2,<3 removed - - torchvision - - tqdm - - typing-extensions - - xarray>=2023.01,<2025.3.0 - - pip: - - -e .. 
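The `pyproject.toml` changes that follow declare the version as dynamic, sourced from `bioimageio.core.__version__` via setuptools' `attr` mechanism. A quick consistency check, assuming the package is installed (for example via `pip install -e .`):

```python
# Hedged sketch: with the dynamic version pointing at bioimageio.core.__version__,
# the installed distribution metadata and the module attribute should agree.
from importlib.metadata import version

import bioimageio.core

assert bioimageio.core.__version__ == version("bioimageio.core")
print(bioimageio.core.__version__)
```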
diff --git a/pyproject.toml b/pyproject.toml index db264984..059d8e9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,88 @@ +[project] +name = "bioimageio.core" +description = "Python specific core utilities for bioimage.io resources (in particular DL models)." +authors = [{ name = "Fynn Beuttenmüller", email = "thefynnbe@gmail.com" }] +requires-python = ">=3.9" +readme = "README.md" +dynamic = ["version"] +dependencies = [ + "bioimageio.spec ==0.5.5.4", + "h5py", + "imagecodecs", + "imageio>=2.10", + "loguru", + "numpy", + "pydantic-settings>=2.5,<3", + "pydantic>=2.7.0,<3", + "ruyaml", + "scipy", + "tqdm", + "typing-extensions", + "xarray>=2023.01,<2025.3.0", +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +[project.scripts] +bioimageio = "bioimageio.core.__main__:main" + +[project.urls] +"Bug Reports" = "https://github.com/bioimage-io/core-bioimage-io-python/issues" +Changelog = "https://github.com/bioimage-io/core-bioimage-io-python?tab=readme-ov-file#changelog" +Documentation = "https://bioimage-io.github.io/core-bioimage-io-python/bioimageio/core.html" +Source = "https://github.com/bioimage-io/core-bioimage-io-python" + +[project.optional-dependencies] +onnx = ["onnxruntime"] +pytorch = ["torch>=1.6,<3", "torchvision>=0.21", "keras>=3.0,<4"] +tensorflow = ["tensorflow", "keras>=2.15,<4"] +dev = [ + "black", + "cellpose", # for model testing + "crick", + "httpx", + "jupyter-black", + "jupyter", + "keras>=3.0,<4", + "matplotlib", + "monai", # for model testing + "onnx", + "onnxruntime", + "packaging>=17.0", + "pdoc", + "pre-commit", + "pyright==1.1.404", + "pytest-cov", + "pytest", + "python-dotenv", + "segment-anything", # for model testing + "tensorflow", + "timm", # for model testing + "torch>=1.6,<3", + "torchvision>=0.21", +] + +[build-system] +requires = ["pip", "setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["src/"] + +[tool.setuptools.dynamic] +version = { attr = "bioimageio.core.__version__" } + [tool.black] line-length = 88 extend-exclude = "/presentations/" -target-version = ["py38", "py39", "py310", "py311", "py312"] +target-version = ["py39", "py310", "py311", "py312"] preview = true [tool.pyright] @@ -14,9 +95,9 @@ exclude = [ "scripts/pdoc/patched.py", "tests/old_*", ] -include = ["bioimageio", "scripts", "tests"] +include = ["src", "scripts", "tests"] pythonPlatform = "All" -pythonVersion = "3.12" +pythonVersion = "3.9" reportDuplicateImport = "error" reportImplicitStringConcatenation = "error" reportIncompatibleMethodOverride = true @@ -40,12 +121,12 @@ typeCheckingMode = "strict" useLibraryCodeForTypes = true [tool.pytest.ini_options] -addopts = "--doctest-modules --failed-first --ignore dogfood --ignore bioimageio/core/backends --ignore bioimageio/core/weight_converters" -testpaths = ["bioimageio/core", "tests"] +addopts = "--doctest-modules --failed-first --ignore dogfood --ignore src/bioimageio/core/backends --ignore src/bioimageio/core/weight_converters" +testpaths = ["src", "tests"] [tool.ruff] line-length = 88 -target-version = "py312" +target-version = "py39" include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"] exclude = [ "presentations", @@ -53,5 +134,8 @@ exclude = [ 
"scripts/pdoc/patched.py", ] +[tool.ruff.lint] +select = ["NPY201"] + [tool.coverage.report] exclude_also = ["if TYPE_CHECKING:", "assert_never\\("] diff --git a/scripts/show_diff.py b/scripts/show_diff.py index 3e273d79..d1f4affe 100644 --- a/scripts/show_diff.py +++ b/scripts/show_diff.py @@ -2,14 +2,10 @@ from pathlib import Path from tempfile import TemporaryDirectory -import pooch # pyright: ignore[reportMissingTypeStubs] - from bioimageio.core import load_description, save_bioimageio_yaml_only if __name__ == "__main__": rdf_source = "https://raw.githubusercontent.com/bioimage-io/spec-bioimage-io/main/example_descriptions/models/unet2d_nuclei_broad/v0_4_9.bioimageio.yaml" - - local_source = Path(pooch.retrieve(rdf_source, None)) model_as_is = load_description(rdf_source, format_version="discover") model_latest = load_description(rdf_source, format_version="latest") print(model_latest.validation_summary) diff --git a/setup.py b/setup.py deleted file mode 100644 index ff933b96..00000000 --- a/setup.py +++ /dev/null @@ -1,83 +0,0 @@ -import json -from pathlib import Path - -from setuptools import find_namespace_packages, setup - -# Get the long description from the README file -ROOT_DIR = Path(__file__).parent.resolve() -long_description = (ROOT_DIR / "README.md").read_text(encoding="utf-8") -VERSION_FILE = ROOT_DIR / "bioimageio" / "core" / "VERSION" -VERSION = json.loads(VERSION_FILE.read_text())["version"] - - -_ = setup( - name="bioimageio.core", - version=VERSION, - description="Python functionality for the bioimage model zoo", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/bioimage-io/core-bioimage-io-python", - author="Bioimage Team", - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - ], - packages=find_namespace_packages(exclude=["tests"]), - install_requires=[ - "bioimageio.spec ==0.5.4.3", - "h5py", - "imagecodecs", - "imageio>=2.10", - "loguru", - "numpy", - "pydantic-settings>=2.5,<3", - "pydantic>=2.7.0,<3", - "ruyaml", - "tqdm", - "typing-extensions", - "xarray>=2023.01,<2025.3.0", - ], - include_package_data=True, - extras_require={ - "pytorch": ( - pytorch_deps := ["torch>=1.6,<3", "torchvision>=0.21", "keras>=3.0,<4"] - ), - "tensorflow": ["tensorflow", "keras>=2.15,<4"], - "onnx": ["onnxruntime"], - "tests": ( # minimal test requirements - test_deps := ["pytest", "pytest-cov", "python-dotenv"] - ), - "dev": ( - test_deps - + pytorch_deps - + [ - "black", - "cellpose", # for model testing - "httpx", - "jupyter-black", - "jupyter", - "matplotlib", - "monai", # for model testing - "onnx", - "onnxruntime", - "packaging>=17.0", - "pdoc", - "pre-commit", - "pyright==1.1.403", - "segment-anything", # for model testing - "timm", # for model testing - # "crick", # currently requires python<=3.9 - ] - ), - }, - project_urls={ - "Bug Reports": "https://github.com/bioimage-io/core-bioimage-io-python/issues", - "Source": "https://github.com/bioimage-io/core-bioimage-io-python", - }, - entry_points={"console_scripts": ["bioimageio = bioimageio.core.__main__:main"]}, -) diff --git a/bioimageio/core/__init__.py b/src/bioimageio/core/__init__.py similarity index 96% rename from bioimageio/core/__init__.py rename to src/bioimageio/core/__init__.py index 
d37be4d4..56f15e98 100644 --- a/bioimageio/core/__init__.py +++ b/src/bioimageio/core/__init__.py @@ -1,6 +1,12 @@ """ .. include:: ../../README.md """ +# ruff: noqa: E402 + +__version__ = "0.9.1" +from loguru import logger + +logger.disable("bioimageio.core") from bioimageio.spec import ( ValidationSummary, @@ -50,12 +56,8 @@ from .stat_calculators import compute_dataset_measures from .stat_measures import Stat from .tensor import Tensor -from .utils import VERSION from .weight_converters import add_weights -__version__ = VERSION - - # aliases test_resource = test_description """alias of `test_description`""" diff --git a/src/bioimageio/core/__main__.py b/src/bioimageio/core/__main__.py new file mode 100644 index 00000000..ed7c3280 --- /dev/null +++ b/src/bioimageio/core/__main__.py @@ -0,0 +1,25 @@ +import sys + +from loguru import logger + +logger.enable("bioimageio") + +logger.remove() +_ = logger.add( + sys.stderr, + level="INFO", + format="{elapsed:} | " + + "{level: <8} | " + + "{module} - {message}", +) + +from .cli import Bioimageio + + +def main(): + cli = Bioimageio() # pyright: ignore[reportCallIssue] + cli.run() + + +if __name__ == "__main__": + main() diff --git a/bioimageio/core/_magic_tensor_ops.py b/src/bioimageio/core/_magic_tensor_ops.py similarity index 100% rename from bioimageio/core/_magic_tensor_ops.py rename to src/bioimageio/core/_magic_tensor_ops.py diff --git a/bioimageio/core/_op_base.py b/src/bioimageio/core/_op_base.py similarity index 100% rename from bioimageio/core/_op_base.py rename to src/bioimageio/core/_op_base.py diff --git a/bioimageio/core/_prediction_pipeline.py b/src/bioimageio/core/_prediction_pipeline.py similarity index 100% rename from bioimageio/core/_prediction_pipeline.py rename to src/bioimageio/core/_prediction_pipeline.py diff --git a/bioimageio/core/_resource_tests.py b/src/bioimageio/core/_resource_tests.py similarity index 87% rename from bioimageio/core/_resource_tests.py rename to src/bioimageio/core/_resource_tests.py index 97b62bd3..3c5bb8a0 100644 --- a/bioimageio/core/_resource_tests.py +++ b/src/bioimageio/core/_resource_tests.py @@ -2,12 +2,14 @@ import os import platform import subprocess +import sys import warnings from io import StringIO from itertools import product from pathlib import Path from tempfile import TemporaryDirectory from typing import ( + Any, Callable, Dict, Hashable, @@ -21,14 +23,20 @@ overload, ) -import xarray as xr +import numpy as np from loguru import logger +from numpy.typing import NDArray from typing_extensions import NotRequired, TypedDict, Unpack, assert_never, get_args +from bioimageio.core import __version__ from bioimageio.spec import ( + AnyDatasetDescr, + AnyModelDescr, BioimageioCondaEnv, + DatasetDescr, InvalidDescr, LatestResourceDescr, + ModelDescr, ResourceDescr, ValidationContext, build_description, @@ -62,9 +70,8 @@ from ._prediction_pipeline import create_prediction_pipeline from .axis import AxisId, BatchSize from .common import MemberId, SupportedWeightsFormat -from .digest_spec import get_test_inputs, get_test_outputs +from .digest_spec import get_test_input_sample, get_test_output_sample from .sample import Sample -from .utils import VERSION class DeprecatedKwargs(TypedDict): @@ -165,7 +172,7 @@ def test_model( *, determinism: Literal["seed_only", "full"] = "seed_only", sha256: Optional[Sha256] = None, - stop_early: bool = False, + stop_early: bool = True, **deprecated: Unpack[DeprecatedKwargs], ) -> ValidationSummary: """Test model inference""" @@ -195,7 +202,7 @@ def 
test_description( determinism: Literal["seed_only", "full"] = "seed_only", expected_type: Optional[str] = None, sha256: Optional[Sha256] = None, - stop_early: bool = False, + stop_early: bool = True, runtime_env: Union[ Literal["currently-active", "as-described"], Path, BioimageioCondaEnv ] = ("currently-active"), @@ -249,7 +256,10 @@ def test_description( else: assert_never(runtime_env) - with TemporaryDirectory(ignore_cleanup_errors=True) as _d: + td_kwargs: Dict[str, Any] = ( + dict(ignore_cleanup_errors=True) if sys.version_info >= (3, 10) else {} + ) + with TemporaryDirectory(**td_kwargs) as _d: working_dir = Path(_d) if isinstance(source, (dict, ResourceDescrBase)): file_source = save_bioimageio_package( @@ -461,6 +471,36 @@ def _test_in_env( return ValidationSummary.load_json(summary_path) +@overload +def load_description_and_test( + source: Union[ResourceDescr, PermissiveFileSource, BioimageioYamlContent], + *, + format_version: Literal["latest"], + weight_format: Optional[SupportedWeightsFormat] = None, + devices: Optional[Sequence[str]] = None, + determinism: Literal["seed_only", "full"] = "seed_only", + expected_type: Literal["model"], + sha256: Optional[Sha256] = None, + stop_early: bool = True, + **deprecated: Unpack[DeprecatedKwargs], +) -> Union[ModelDescr, InvalidDescr]: ... + + +@overload +def load_description_and_test( + source: Union[ResourceDescr, PermissiveFileSource, BioimageioYamlContent], + *, + format_version: Literal["latest"], + weight_format: Optional[SupportedWeightsFormat] = None, + devices: Optional[Sequence[str]] = None, + determinism: Literal["seed_only", "full"] = "seed_only", + expected_type: Literal["dataset"], + sha256: Optional[Sha256] = None, + stop_early: bool = True, + **deprecated: Unpack[DeprecatedKwargs], +) -> Union[DatasetDescr, InvalidDescr]: ... + + @overload def load_description_and_test( source: Union[ResourceDescr, PermissiveFileSource, BioimageioYamlContent], @@ -471,11 +511,41 @@ def load_description_and_test( determinism: Literal["seed_only", "full"] = "seed_only", expected_type: Optional[str] = None, sha256: Optional[Sha256] = None, - stop_early: bool = False, + stop_early: bool = True, **deprecated: Unpack[DeprecatedKwargs], ) -> Union[LatestResourceDescr, InvalidDescr]: ... +@overload +def load_description_and_test( + source: Union[ResourceDescr, PermissiveFileSource, BioimageioYamlContent], + *, + format_version: Union[FormatVersionPlaceholder, str] = DISCOVER, + weight_format: Optional[SupportedWeightsFormat] = None, + devices: Optional[Sequence[str]] = None, + determinism: Literal["seed_only", "full"] = "seed_only", + expected_type: Literal["model"], + sha256: Optional[Sha256] = None, + stop_early: bool = True, + **deprecated: Unpack[DeprecatedKwargs], +) -> Union[AnyModelDescr, InvalidDescr]: ... + + +@overload +def load_description_and_test( + source: Union[ResourceDescr, PermissiveFileSource, BioimageioYamlContent], + *, + format_version: Union[FormatVersionPlaceholder, str] = DISCOVER, + weight_format: Optional[SupportedWeightsFormat] = None, + devices: Optional[Sequence[str]] = None, + determinism: Literal["seed_only", "full"] = "seed_only", + expected_type: Literal["dataset"], + sha256: Optional[Sha256] = None, + stop_early: bool = True, + **deprecated: Unpack[DeprecatedKwargs], +) -> Union[AnyDatasetDescr, InvalidDescr]: ... 
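The overloads added here (and continued below) let static type checkers narrow the return type when `expected_type` is passed as a literal. A hedged usage sketch; the source path is a placeholder and the example assumes `load_description_and_test` is re-exported at package level, as in current releases:

```python
# Hedged usage sketch for the narrowed overloads: with expected_type="model"
# and format_version="latest", a passing result is typed as ModelDescr.
from bioimageio.core import load_description_and_test
from bioimageio.spec import InvalidDescr

descr = load_description_and_test(
    "path/to/rdf.yaml",   # placeholder source
    format_version="latest",
    expected_type="model",
    stop_early=True,      # note: stop_early now defaults to True
)
if isinstance(descr, InvalidDescr):
    raise ValueError("resource description did not pass testing")
print(descr.validation_summary.status)
```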
+ + @overload def load_description_and_test( source: Union[ResourceDescr, PermissiveFileSource, BioimageioYamlContent], @@ -486,7 +556,7 @@ def load_description_and_test( determinism: Literal["seed_only", "full"] = "seed_only", expected_type: Optional[str] = None, sha256: Optional[Sha256] = None, - stop_early: bool = False, + stop_early: bool = True, **deprecated: Unpack[DeprecatedKwargs], ) -> Union[ResourceDescr, InvalidDescr]: ... @@ -500,7 +570,7 @@ def load_description_and_test( determinism: Literal["seed_only", "full"] = "seed_only", expected_type: Optional[str] = None, sha256: Optional[Sha256] = None, - stop_early: bool = False, + stop_early: bool = True, **deprecated: Unpack[DeprecatedKwargs], ) -> Union[ResourceDescr, InvalidDescr]: """Test a bioimage.io resource dynamically, @@ -557,7 +627,7 @@ def load_description_and_test( ) rd.validation_summary.env.add( - InstalledPackage(name="bioimageio.core", version=VERSION) + InstalledPackage(name="bioimageio.core", version=__version__) ) if expected_type is not None: @@ -678,13 +748,13 @@ def add_warning_entry(msg: str): ) try: - inputs = get_test_inputs(model) - expected = get_test_outputs(model) + test_input = get_test_input_sample(model) + expected = get_test_output_sample(model) with create_prediction_pipeline( bioimageio_model=model, devices=devices, weight_format=weight_format ) as prediction_pipeline: - results = prediction_pipeline.predict_sample_without_blocking(inputs) + results = prediction_pipeline.predict_sample_without_blocking(test_input) if len(results.members) != len(expected.members): add_error_entry( @@ -701,45 +771,71 @@ def add_warning_entry(msg: str): else: continue + if actual.dims != (dims := expected.dims): + add_error_entry( + f"Output '{m}' has dims {actual.dims}, but expected {expected.dims}" + ) + if stop_early: + break + else: + continue + + if actual.tagged_shape != expected.tagged_shape: + add_error_entry( + f"Output '{m}' has shape {actual.tagged_shape}, but expected {expected.tagged_shape}" + ) + if stop_early: + break + else: + continue + + expected_np = expected.data.to_numpy().astype(np.float32) + del expected + actual_np: NDArray[Any] = actual.data.to_numpy().astype(np.float32) + del actual + rtol, atol, mismatched_tol = _get_tolerance( model, wf=weight_format, m=m, **deprecated ) - rtol_value = rtol * abs(expected) - abs_diff = abs(actual - expected) + rtol_value = rtol * abs(expected_np) + abs_diff = abs(actual_np - expected_np) mismatched = abs_diff > atol + rtol_value mismatched_elements = mismatched.sum().item() if not mismatched_elements: continue - mismatched_ppm = mismatched_elements / expected.size * 1e6 + mismatched_ppm = mismatched_elements / expected_np.size * 1e6 abs_diff[~mismatched] = 0 # ignore non-mismatched elements - r_max_idx = (r_diff := (abs_diff / (abs(expected) + 1e-6))).argmax() + r_max_idx_flat = ( + r_diff := (abs_diff / (abs(expected_np) + 1e-6)) + ).argmax() + r_max_idx = np.unravel_index(r_max_idx_flat, r_diff.shape) r_max = r_diff[r_max_idx].item() - r_actual = actual[r_max_idx].item() - r_expected = expected[r_max_idx].item() + r_actual = actual_np[r_max_idx].item() + r_expected = expected_np[r_max_idx].item() # Calculate the max absolute difference with the relative tolerance subtracted - abs_diff_wo_rtol: xr.DataArray = xr.ufuncs.maximum( - (abs_diff - rtol_value).data, 0 + abs_diff_wo_rtol: NDArray[np.float32] = (abs_diff - rtol_value).max( + initial=0 + ) + a_max_idx = np.unravel_index( + abs_diff_wo_rtol.argmax(), abs_diff_wo_rtol.shape ) - a_max_idx = { - 
AxisId(k): int(v) for k, v in abs_diff_wo_rtol.argmax().items() - } a_max = abs_diff[a_max_idx].item() - a_actual = actual[a_max_idx].item() - a_expected = expected[a_max_idx].item() + a_actual = actual_np[a_max_idx].item() + a_expected = expected_np[a_max_idx].item() msg = ( f"Output '{m}' disagrees with {mismatched_elements} of" - + f" {expected.size} expected values" + + f" {expected_np.size} expected values" + f" ({mismatched_ppm:.1f} ppm)." + f"\n Max relative difference: {r_max:.2e}" + rf" (= \|{r_actual:.2e} - {r_expected:.2e}\|/\|{r_expected:.2e} + 1e-6\|)" - + f" at {r_max_idx}" + + f" at {dict(zip(dims, r_max_idx))}" + f"\n Max absolute difference not accounted for by relative tolerance: {a_max:.2e}" - + rf" (= \|{a_actual:.7e} - {a_expected:.7e}\|) at {a_max_idx}" + + rf" (= \|{a_actual:.7e} - {a_expected:.7e}\|) at {dict(zip(dims, a_max_idx))}" ) if mismatched_ppm > mismatched_tol: add_error_entry(msg) @@ -802,7 +898,8 @@ def _test_model_inference_parametrized( (b, n) for b, n in product(sorted(batch_sizes), sorted(ns)) } logger.info( - "Testing inference with {} different inputs (B, N): {}", + "Testing inference with '{}' for {} different inputs (B, N): {}", + weight_format, len(test_cases), test_cases, ) @@ -833,7 +930,7 @@ def get_ns(n: int): resized_test_inputs = Sample( members={ t.id: ( - test_inputs.members[t.id].resize_to( + test_input.members[t.id].resize_to( { aid: s for (tid, aid), s in input_target_sizes.items() @@ -843,8 +940,8 @@ def get_ns(n: int): ) for t in model.inputs }, - stat=test_inputs.stat, - id=test_inputs.id, + stat=test_input.stat, + id=test_input.id, ) expected_output_shapes = { t.id: { @@ -857,7 +954,7 @@ def get_ns(n: int): yield n, batch_size, resized_test_inputs, expected_output_shapes try: - test_inputs = get_test_inputs(model) + test_input = get_test_input_sample(model) with create_prediction_pipeline( bioimageio_model=model, devices=devices, weight_format=weight_format diff --git a/bioimageio/core/_settings.py b/src/bioimageio/core/_settings.py similarity index 100% rename from bioimageio/core/_settings.py rename to src/bioimageio/core/_settings.py diff --git a/bioimageio/core/axis.py b/src/bioimageio/core/axis.py similarity index 100% rename from bioimageio/core/axis.py rename to src/bioimageio/core/axis.py diff --git a/bioimageio/core/backends/__init__.py b/src/bioimageio/core/backends/__init__.py similarity index 100% rename from bioimageio/core/backends/__init__.py rename to src/bioimageio/core/backends/__init__.py diff --git a/bioimageio/core/backends/_model_adapter.py b/src/bioimageio/core/backends/_model_adapter.py similarity index 97% rename from bioimageio/core/backends/_model_adapter.py rename to src/bioimageio/core/backends/_model_adapter.py index db4d44e9..742d4912 100644 --- a/bioimageio/core/backends/_model_adapter.py +++ b/src/bioimageio/core/backends/_model_adapter.py @@ -1,4 +1,3 @@ -import sys import warnings from abc import ABC, abstractmethod from typing import ( @@ -11,6 +10,7 @@ final, ) +from exceptiongroup import ExceptionGroup from numpy.typing import NDArray from typing_extensions import assert_never @@ -173,10 +173,7 @@ def create( "None of the weight format specific model adapters could be created" + " in this environment." 
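The reworked comparison above operates on plain numpy arrays instead of xarray objects: mismatches are counted against `atol + rtol * |expected|` and the worst offender is located with `np.unravel_index`. A simplified, self-contained sketch of that logic on random data with illustrative tolerance values:

```python
# Hedged sketch of the numpy-based tolerance check: count elements outside
# atol + rtol*|expected| and report the largest relative difference with its
# multi-dimensional index.
import numpy as np

rng = np.random.default_rng(0)
expected = rng.normal(size=(1, 2, 8, 8)).astype(np.float32)
actual = expected + rng.normal(scale=1e-4, size=expected.shape).astype(np.float32)

rtol, atol = 1e-3, 1e-4
abs_diff = np.abs(actual - expected)
mismatched = abs_diff > atol + rtol * np.abs(expected)
mismatched_ppm = mismatched.sum() / expected.size * 1e6

r_diff = abs_diff / (np.abs(expected) + 1e-6)
r_max_idx = np.unravel_index(r_diff.argmax(), r_diff.shape)
print(f"{mismatched_ppm:.1f} ppm mismatched; max rel diff {r_diff[r_max_idx]:.2e} at {r_max_idx}")
```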
) - if sys.version_info[:2] >= (3, 11): - raise ExceptionGroup(msg, errors) - else: - raise ValueError(msg) from Exception(errors) + raise ExceptionGroup(msg, errors) @final def load(self, *, devices: Optional[Sequence[str]] = None) -> None: diff --git a/bioimageio/core/backends/keras_backend.py b/src/bioimageio/core/backends/keras_backend.py similarity index 100% rename from bioimageio/core/backends/keras_backend.py rename to src/bioimageio/core/backends/keras_backend.py diff --git a/bioimageio/core/backends/onnx_backend.py b/src/bioimageio/core/backends/onnx_backend.py similarity index 100% rename from bioimageio/core/backends/onnx_backend.py rename to src/bioimageio/core/backends/onnx_backend.py diff --git a/bioimageio/core/backends/pytorch_backend.py b/src/bioimageio/core/backends/pytorch_backend.py similarity index 100% rename from bioimageio/core/backends/pytorch_backend.py rename to src/bioimageio/core/backends/pytorch_backend.py diff --git a/bioimageio/core/backends/tensorflow_backend.py b/src/bioimageio/core/backends/tensorflow_backend.py similarity index 100% rename from bioimageio/core/backends/tensorflow_backend.py rename to src/bioimageio/core/backends/tensorflow_backend.py diff --git a/bioimageio/core/backends/torchscript_backend.py b/src/bioimageio/core/backends/torchscript_backend.py similarity index 100% rename from bioimageio/core/backends/torchscript_backend.py rename to src/bioimageio/core/backends/torchscript_backend.py diff --git a/bioimageio/core/block.py b/src/bioimageio/core/block.py similarity index 100% rename from bioimageio/core/block.py rename to src/bioimageio/core/block.py diff --git a/bioimageio/core/block_meta.py b/src/bioimageio/core/block_meta.py similarity index 100% rename from bioimageio/core/block_meta.py rename to src/bioimageio/core/block_meta.py diff --git a/bioimageio/core/cli.py b/src/bioimageio/core/cli.py similarity index 97% rename from bioimageio/core/cli.py rename to src/bioimageio/core/cli.py index d5c49067..ff24f1ec 100644 --- a/bioimageio/core/cli.py +++ b/src/bioimageio/core/cli.py @@ -47,6 +47,7 @@ from typing_extensions import assert_never import bioimageio.spec +from bioimageio.core import __version__ from bioimageio.spec import ( AnyModelDescr, InvalidDescr, @@ -79,7 +80,7 @@ ) from .sample import Sample from .stat_measures import Stat -from .utils import VERSION, compare +from .utils import compare from .weight_converters._add_weights import add_weights WEIGHT_FORMAT_ALIASES = AliasChoices( @@ -99,10 +100,9 @@ class ArgMixin(BaseModel, use_attribute_docstrings=True, cli_implicit_flags=True class WithSummaryLogging(ArgMixin): - summary: Sequence[Union[Literal["display"], Path]] = Field( - ("display",), + summary: List[Union[Literal["display"], Path]] = Field( + default_factory=lambda: ["display"], examples=[ - "display", Path("summary.md"), Path("bioimageio_summaries/"), ["display", Path("summary.md")], @@ -181,7 +181,7 @@ class TestCmd(CmdBase, WithSource, WithSummaryLogging): (only relevant for model resources)""" - devices: Optional[Union[str, Sequence[str]]] = None + devices: Optional[List[str]] = None """Device(s) to use for testing""" runtime_env: Union[Literal["currently-active", "as-described"], Path] = Field( @@ -396,8 +396,8 @@ def updated(self): class PredictCmd(CmdBase, WithSource): """Run inference on your data with a bioimage.io model.""" - inputs: NotEmpty[Sequence[Union[str, NotEmpty[Tuple[str, ...]]]]] = ( - "{input_id}/001.tif", + inputs: NotEmpty[List[Union[str, NotEmpty[List[str]]]]] = Field( + 
default_factory=lambda: ["{input_id}/001.tif"] ) """Model input sample paths (for each input tensor) @@ -485,7 +485,11 @@ def _example(self): example_inputs = ( model_descr.sample_inputs if isinstance(model_descr, v0_4.ModelDescr) - else [ipt.sample_tensor or ipt.test_tensor for ipt in model_descr.inputs] + else [ + t + for ipt in model_descr.inputs + if (t := ipt.sample_tensor or ipt.test_tensor) + ] ) if not example_inputs: raise ValueError(f"{self.descr_id} does not specify any example inputs.") @@ -561,7 +565,7 @@ def get_example_command(preview: bool, escape: bool = False): print( "🎉 Sucessfully ran example prediction!\n" + "To predict the example input using the CLI example config file" - + f" {example_path/YAML_FILE}, execute `bioimageio predict` from {example_path}:\n" + + f" {example_path / YAML_FILE}, execute `bioimageio predict` from {example_path}:\n" + f"$ cd {str(example_path)}\n" + f'$ bioimageio predict "{source_escaped}"\n\n' + "Alternatively run the following command" @@ -589,7 +593,7 @@ def run(self): for ipt in model_descr.inputs ) - def expand_inputs(i: int, ipt: Union[str, Tuple[str, ...]]) -> Tuple[str, ...]: + def expand_inputs(i: int, ipt: Union[str, Sequence[str]]) -> Tuple[str, ...]: if isinstance(ipt, str): ipts = tuple( ipt.format(model_id=self.descr_id, input_id=t) for t in input_ids @@ -750,6 +754,10 @@ class AddWeightsCmd(CmdBase, WithSource, WithSummaryLogging): verbose: bool = False """Log more (error) output.""" + tracing: bool = True + """Allow tracing when converting pytorch_state_dict to torchscript + (still uses scripting if possible).""" + def run(self): model_descr = ensure_description_is_model(self.descr) if isinstance(model_descr, v0_4.ModelDescr): @@ -763,10 +771,8 @@ def run(self): source_format=self.source_format, target_format=self.target_format, verbose=self.verbose, + allow_tracing=self.tracing, ) - if updated_model_descr is None: - return - self.log(updated_model_descr) @@ -865,7 +871,7 @@ def run(self): Bioimageio.__doc__ += f""" library versions: - bioimageio.core {VERSION} + bioimageio.core {__version__} bioimageio.spec {bioimageio.spec.__version__} spec format versions: diff --git a/bioimageio/core/commands.py b/src/bioimageio/core/commands.py similarity index 84% rename from bioimageio/core/commands.py rename to src/bioimageio/core/commands.py index 36982854..61d0bd4b 100644 --- a/bioimageio/core/commands.py +++ b/src/bioimageio/core/commands.py @@ -6,7 +6,6 @@ from typing_extensions import Literal -from bioimageio.core.common import SupportedWeightsFormat from bioimageio.spec import ( InvalidDescr, ResourceDescr, @@ -17,8 +16,27 @@ from ._resource_tests import test_description -WeightFormatArgAll = Literal[SupportedWeightsFormat, "all"] -WeightFormatArgAny = Literal[SupportedWeightsFormat, "any"] +# unfortunately this does not work with py3.9 and pydantic 2.11 +# from bioimageio.core.common import SupportedWeightsFormat +# WeightFormatArgAll = Literal[SupportedWeightsFormat, "all"] +# WeightFormatArgAny = Literal[SupportedWeightsFormat, "any"] +# so we write out the literal explicitly +WeightFormatArgAll = Literal[ + "keras_hdf5", + "onnx", + "pytorch_state_dict", + "tensorflow_saved_model_bundle", + "torchscript", + "all", +] +WeightFormatArgAny = Literal[ + "keras_hdf5", + "onnx", + "pytorch_state_dict", + "tensorflow_saved_model_bundle", + "torchscript", + "any", +] def test( diff --git a/bioimageio/core/common.py b/src/bioimageio/core/common.py similarity index 100% rename from bioimageio/core/common.py rename to 
src/bioimageio/core/common.py diff --git a/bioimageio/core/dataset.py b/src/bioimageio/core/dataset.py similarity index 100% rename from bioimageio/core/dataset.py rename to src/bioimageio/core/dataset.py diff --git a/bioimageio/core/digest_spec.py b/src/bioimageio/core/digest_spec.py similarity index 87% rename from bioimageio/core/digest_spec.py rename to src/bioimageio/core/digest_spec.py index 6a10b645..4b86c64e 100644 --- a/bioimageio/core/digest_spec.py +++ b/src/bioimageio/core/digest_spec.py @@ -84,6 +84,10 @@ def import_callable( return c +tmp_dirs_in_use: List[TemporaryDirectory[str]] = [] +"""keep global reference to temporary directories created during import to delay cleanup""" + + def _import_from_file_impl( source: FileSource, callable_name: str, **kwargs: Unpack[HashKwargs] ): @@ -108,7 +112,17 @@ def _import_from_file_impl( module = sys.modules.get(module_name) if module is None: try: - tmp_dir = TemporaryDirectory(ignore_cleanup_errors=True) + td_kwargs: Dict[str, Any] = ( + dict(ignore_cleanup_errors=True) if sys.version_info >= (3, 10) else {} + ) + if sys.version_info >= (3, 12): + td_kwargs["delete"] = False + + tmp_dir = TemporaryDirectory(**td_kwargs) + # keep global ref to tmp_dir to delay cleanup until program exit + # TODO: remove for py >= 3.12, when delete=False works + tmp_dirs_in_use.append(tmp_dir) + module_path = Path(tmp_dir.name) / module_name if reader.original_file_name.endswith(".zip") or is_zipfile(reader): module_path.mkdir() @@ -204,36 +218,54 @@ def get_member_ids( return [get_member_id(descr) for descr in tensor_descriptions] -def get_test_inputs(model: AnyModelDescr) -> Sample: - """returns a model's test input sample""" - member_ids = get_member_ids(model.inputs) - if isinstance(model, v0_4.ModelDescr): - arrays = [load_array(tt) for tt in model.test_inputs] - else: - arrays = [load_array(d.test_tensor) for d in model.inputs] - - axes = [get_axes_infos(t) for t in model.inputs] - return Sample( - members={ - m: Tensor.from_numpy(arr, dims=ax) - for m, arr, ax in zip(member_ids, arrays, axes) - }, - stat={}, - id="test-sample", +def get_test_input_sample(model: AnyModelDescr) -> Sample: + return _get_test_sample( + model.inputs, + model.test_inputs if isinstance(model, v0_4.ModelDescr) else model.inputs, ) -def get_test_outputs(model: AnyModelDescr) -> Sample: +get_test_inputs = get_test_input_sample +"""DEPRECATED: use `get_test_input_sample` instead""" + + +def get_test_output_sample(model: AnyModelDescr) -> Sample: """returns a model's test output sample""" - member_ids = get_member_ids(model.outputs) + return _get_test_sample( + model.outputs, + model.test_outputs if isinstance(model, v0_4.ModelDescr) else model.outputs, + ) + + +get_test_outputs = get_test_output_sample +"""DEPRECATED: use `get_test_input_sample` instead""" - if isinstance(model, v0_4.ModelDescr): - arrays = [load_array(tt) for tt in model.test_outputs] - else: - arrays = [load_array(d.test_tensor) for d in model.outputs] - axes = [get_axes_infos(t) for t in model.outputs] +def _get_test_sample( + tensor_descrs: Sequence[ + Union[ + v0_4.InputTensorDescr, + v0_4.OutputTensorDescr, + v0_5.InputTensorDescr, + v0_5.OutputTensorDescr, + ] + ], + test_sources: Sequence[Union[FileSource, v0_5.TensorDescr]], +) -> Sample: + """returns a model's input/output test sample""" + member_ids = get_member_ids(tensor_descrs) + arrays: List[NDArray[Any]] = [] + for src in test_sources: + if isinstance(src, (v0_5.InputTensorDescr, v0_5.OutputTensorDescr)): + if src.test_tensor is None: + 
raise ValueError( + f"Model input '{src.id}' has no test tensor defined, cannot create test sample." + ) + arrays.append(load_array(src.test_tensor)) + else: + arrays.append(load_array(src)) + axes = [get_axes_infos(t) for t in tensor_descrs] return Sample( members={ m: Tensor.from_numpy(arr, dims=ax) diff --git a/bioimageio/core/io.py b/src/bioimageio/core/io.py similarity index 100% rename from bioimageio/core/io.py rename to src/bioimageio/core/io.py diff --git a/bioimageio/core/model_adapters.py b/src/bioimageio/core/model_adapters.py similarity index 100% rename from bioimageio/core/model_adapters.py rename to src/bioimageio/core/model_adapters.py diff --git a/bioimageio/core/prediction.py b/src/bioimageio/core/prediction.py similarity index 100% rename from bioimageio/core/prediction.py rename to src/bioimageio/core/prediction.py diff --git a/bioimageio/core/proc_ops.py b/src/bioimageio/core/proc_ops.py similarity index 91% rename from bioimageio/core/proc_ops.py rename to src/bioimageio/core/proc_ops.py index e504bf07..95f7466a 100644 --- a/bioimageio/core/proc_ops.py +++ b/src/bioimageio/core/proc_ops.py @@ -13,6 +13,7 @@ ) import numpy as np +import scipy # pyright: ignore[reportMissingTypeStubs] import xarray as xr from typing_extensions import Self, assert_never @@ -103,7 +104,7 @@ def __call__(self, sample: Union[Sample, SampleBlock]) -> None: assert_never(sample) @abstractmethod - def _apply(self, input: Tensor, stat: Stat) -> Tensor: ... + def _apply(self, x: Tensor, stat: Stat) -> Tensor: ... @dataclass @@ -200,8 +201,8 @@ class Binarize(_SimpleOperator): threshold: Union[float, Sequence[float]] axis: Optional[AxisId] = None - def _apply(self, input: Tensor, stat: Stat) -> Tensor: - return input > self.threshold + def _apply(self, x: Tensor, stat: Stat) -> Tensor: + return x > self.threshold def get_output_shape( self, input_shape: Mapping[AxisId, int] @@ -240,8 +241,8 @@ def __post_init__(self): self.min is None or self.max is None or self.min < self.max ), f"expected min < max, but {self.min} !< {self.max}" - def _apply(self, input: Tensor, stat: Stat) -> Tensor: - return input.clip(self.min, self.max) + def _apply(self, x: Tensor, stat: Stat) -> Tensor: + return x.clip(self.min, self.max) def get_output_shape( self, input_shape: Mapping[AxisId, int] @@ -276,8 +277,8 @@ def get_output_shape( ) -> Mapping[AxisId, int]: return input_shape - def _apply(self, input: Tensor, stat: Stat) -> Tensor: - return input.astype(self.dtype) + def _apply(self, x: Tensor, stat: Stat) -> Tensor: + return x.astype(self.dtype) @dataclass @@ -288,8 +289,8 @@ class ScaleLinear(_SimpleOperator): offset: Union[float, xr.DataArray] = 0.0 """additive term""" - def _apply(self, input: Tensor, stat: Stat) -> Tensor: - return input * self.gain + self.offset + def _apply(self, x: Tensor, stat: Stat) -> Tensor: + return x * self.gain + self.offset def get_output_shape( self, input_shape: Mapping[AxisId, int] @@ -365,12 +366,12 @@ def __post_init__(self): self.ref_mean = Mean(member_id=ref_tensor, axes=axes) self.ref_std = Std(member_id=ref_tensor, axes=axes) - def _apply(self, input: Tensor, stat: Stat) -> Tensor: + def _apply(self, x: Tensor, stat: Stat) -> Tensor: mean = stat[self.mean] std = stat[self.std] + self.eps ref_mean = stat[self.ref_mean] ref_std = stat[self.ref_std] + self.eps - return (input - mean) / std * ref_std + ref_mean + return (x - mean) / std * ref_std + ref_mean def get_output_shape( self, input_shape: Mapping[AxisId, int] @@ -484,10 +485,10 @@ def from_proc_descr( ), ) - def 
_apply(self, input: Tensor, stat: Stat) -> Tensor: + def _apply(self, x: Tensor, stat: Stat) -> Tensor: lower = stat[self.lower] upper = stat[self.upper] - return (input - lower) / (upper - lower + self.eps) + return (x - lower) / (upper - lower + self.eps) def get_descr(self): assert self.lower.axes == self.upper.axes @@ -508,8 +509,8 @@ def get_descr(self): class Sigmoid(_SimpleOperator): """1 / (1 + e^(-input)).""" - def _apply(self, input: Tensor, stat: Stat) -> Tensor: - return Tensor(1.0 / (1.0 + np.exp(-input)), dims=input.dims) + def _apply(self, x: Tensor, stat: Stat) -> Tensor: + return Tensor(1.0 / (1.0 + np.exp(-x)), dims=x.dims) @property def required_measures(self) -> Collection[Measure]: @@ -531,6 +532,36 @@ def get_descr(self): return v0_5.SigmoidDescr() +@dataclass +class Softmax(_SimpleOperator): + """Softmax activation function.""" + + axis: AxisId = AxisId("channel") + + def _apply(self, x: Tensor, stat: Stat) -> Tensor: + axis_idx = x.dims.index(self.axis) + result = scipy.special.softmax(x.data, axis=axis_idx) + result_xr = xr.DataArray(result, dims=x.dims) + return Tensor.from_xarray(result_xr) + + @property + def required_measures(self) -> Collection[Measure]: + return {} + + def get_output_shape( + self, input_shape: Mapping[AxisId, int] + ) -> Mapping[AxisId, int]: + return input_shape + + @classmethod + def from_proc_descr(cls, descr: v0_5.SoftmaxDescr, member_id: MemberId) -> Self: + assert isinstance(descr, v0_5.SoftmaxDescr) + return cls(input=member_id, output=member_id, axis=descr.kwargs.axis) + + def get_descr(self): + return v0_5.SoftmaxDescr(kwargs=v0_5.SoftmaxKwargs(axis=self.axis)) + + @dataclass class ZeroMeanUnitVariance(_SimpleOperator): """normalize to zero mean, unit variance.""" @@ -574,10 +605,10 @@ def from_proc_descr( std=Std(axes=axes, member_id=member_id), ) - def _apply(self, input: Tensor, stat: Stat) -> Tensor: + def _apply(self, x: Tensor, stat: Stat) -> Tensor: mean = stat[self.mean] std = stat[self.std] - return (input - mean) / (std + self.eps) + return (x - mean) / (std + self.eps) def get_descr(self): return v0_5.ZeroMeanUnitVarianceDescr( @@ -641,8 +672,8 @@ def get_descr(self): return v0_5.FixedZeroMeanUnitVarianceDescr(kwargs=kwargs) - def _apply(self, input: Tensor, stat: Stat) -> Tensor: - return (input - self.mean) / (self.std + self.eps) + def _apply(self, x: Tensor, stat: Stat) -> Tensor: + return (x - self.mean) / (self.std + self.eps) ProcDescr = Union[ @@ -662,6 +693,7 @@ def _apply(self, input: Tensor, stat: Stat) -> Tensor: ScaleMeanVariance, ScaleRange, Sigmoid, + Softmax, UpdateStats, ZeroMeanUnitVariance, ] @@ -715,5 +747,7 @@ def get_proc( (v0_4.ZeroMeanUnitVarianceDescr, v0_5.ZeroMeanUnitVarianceDescr), ): return ZeroMeanUnitVariance.from_proc_descr(proc_descr, member_id) + elif isinstance(proc_descr, v0_5.SoftmaxDescr): + return Softmax.from_proc_descr(proc_descr, member_id) else: assert_never(proc_descr) diff --git a/bioimageio/core/proc_setup.py b/src/bioimageio/core/proc_setup.py similarity index 100% rename from bioimageio/core/proc_setup.py rename to src/bioimageio/core/proc_setup.py diff --git a/bioimageio/core/py.typed b/src/bioimageio/core/py.typed similarity index 100% rename from bioimageio/core/py.typed rename to src/bioimageio/core/py.typed diff --git a/bioimageio/core/sample.py b/src/bioimageio/core/sample.py similarity index 100% rename from bioimageio/core/sample.py rename to src/bioimageio/core/sample.py diff --git a/bioimageio/core/stat_calculators.py b/src/bioimageio/core/stat_calculators.py 
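The new `Softmax` operator added to `proc_ops.py` above delegates to `scipy.special.softmax` along the position of its configured axis (default `channel`). A standalone sketch of the same computation on an xarray array with made-up dimensions:

```python
# Hedged sketch of what the Softmax operator computes per sample.
import numpy as np
import xarray as xr
from scipy.special import softmax

data = xr.DataArray(
    np.random.default_rng(0).normal(size=(1, 3, 4, 4)).astype(np.float32),
    dims=("batch", "channel", "y", "x"),
)
axis_idx = data.dims.index("channel")
result = xr.DataArray(softmax(data.data, axis=axis_idx), dims=data.dims)
assert np.allclose(result.sum("channel"), 1.0)  # per-pixel probabilities sum to 1
```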
similarity index 99% rename from bioimageio/core/stat_calculators.py rename to src/bioimageio/core/stat_calculators.py index 515fe843..ce904068 100644 --- a/bioimageio/core/stat_calculators.py +++ b/src/bioimageio/core/stat_calculators.py @@ -49,7 +49,7 @@ from .tensor import Tensor try: - import crick # pyright: ignore[reportMissingImports] + import crick # pyright: ignore[reportMissingTypeStubs] except Exception: crick = None diff --git a/bioimageio/core/stat_measures.py b/src/bioimageio/core/stat_measures.py similarity index 100% rename from bioimageio/core/stat_measures.py rename to src/bioimageio/core/stat_measures.py diff --git a/bioimageio/core/tensor.py b/src/bioimageio/core/tensor.py similarity index 99% rename from bioimageio/core/tensor.py rename to src/bioimageio/core/tensor.py index 9d69e970..408865de 100644 --- a/bioimageio/core/tensor.py +++ b/src/bioimageio/core/tensor.py @@ -45,6 +45,7 @@ # TODO: complete docstrings +# TODO: in the long run---with improved typing in xarray---we should probably replace `Tensor` with xr.DataArray class Tensor(MagicTensorOpsMixin): """A wrapper around an xr.DataArray for better integration with bioimageio.spec and improved type annotations.""" diff --git a/src/bioimageio/core/utils/__init__.py b/src/bioimageio/core/utils/__init__.py new file mode 100644 index 00000000..62120894 --- /dev/null +++ b/src/bioimageio/core/utils/__init__.py @@ -0,0 +1 @@ +from ._compare import compare as compare diff --git a/bioimageio/core/utils/_compare.py b/src/bioimageio/core/utils/_compare.py similarity index 100% rename from bioimageio/core/utils/_compare.py rename to src/bioimageio/core/utils/_compare.py diff --git a/bioimageio/core/utils/_type_guards.py b/src/bioimageio/core/utils/_type_guards.py similarity index 100% rename from bioimageio/core/utils/_type_guards.py rename to src/bioimageio/core/utils/_type_guards.py diff --git a/bioimageio/core/weight_converters/__init__.py b/src/bioimageio/core/weight_converters/__init__.py similarity index 100% rename from bioimageio/core/weight_converters/__init__.py rename to src/bioimageio/core/weight_converters/__init__.py diff --git a/bioimageio/core/weight_converters/_add_weights.py b/src/bioimageio/core/weight_converters/_add_weights.py similarity index 83% rename from bioimageio/core/weight_converters/_add_weights.py rename to src/bioimageio/core/weight_converters/_add_weights.py index 978c8450..387a3004 100644 --- a/bioimageio/core/weight_converters/_add_weights.py +++ b/src/bioimageio/core/weight_converters/_add_weights.py @@ -1,5 +1,5 @@ import traceback -from typing import Optional +from typing import Optional, Union from loguru import logger from pydantic import DirectoryPath @@ -21,7 +21,8 @@ def add_weights( source_format: Optional[WeightsFormat] = None, target_format: Optional[WeightsFormat] = None, verbose: bool = False, -) -> Optional[ModelDescr]: + allow_tracing: bool = True, +) -> Union[ModelDescr, InvalidDescr]: """Convert model weights to other formats and add them to the model description Args: @@ -34,8 +35,8 @@ def add_weights( verbose: log more (error) output Returns: - - An updated model description if any converted weights were added. - - `None` if no conversion was possible. + A (potentially invalid) model copy stored at `output_path` with added weights if any conversion was possible. 
+ """ if not isinstance(model_descr, ModelDescr): if model_descr.type == "model" and not isinstance(model_descr, InvalidDescr): @@ -51,10 +52,9 @@ def add_weights( model_descr, output_path=output_path ) # reload from local folder to make sure we do not edit the given model - _model_descr = load_model_description(output_path, perform_io_checks=False) - assert isinstance(_model_descr, ModelDescr) - model_descr = _model_descr - del _model_descr + model_descr = load_model_description( + output_path, perform_io_checks=False, format_version="latest" + ) if source_format is None: available = set(model_descr.weights.available_formats) @@ -83,14 +83,14 @@ def add_weights( ) except Exception as e: if verbose: - traceback.print_exception(e) + traceback.print_exception(type(e), e, e.__traceback__) logger.error(e) else: available.add("torchscript") missing.discard("torchscript") - if "pytorch_state_dict" in available and "torchscript" in missing: + if allow_tracing and "pytorch_state_dict" in available and "torchscript" in missing: logger.info( "Attempting to convert 'pytorch_state_dict' weights to 'torchscript' by tracing." ) @@ -106,7 +106,7 @@ def add_weights( ) except Exception as e: if verbose: - traceback.print_exception(e) + traceback.print_exception(type(e), e, e.__traceback__) logger.error(e) else: @@ -125,7 +125,7 @@ def add_weights( ) except Exception as e: if verbose: - traceback.print_exception(e) + traceback.print_exception(type(e), e, e.__traceback__) logger.error(e) else: @@ -146,7 +146,7 @@ def add_weights( ) except Exception as e: if verbose: - traceback.print_exception(e) + traceback.print_exception(type(e), e, e.__traceback__) logger.error(e) else: @@ -163,11 +163,17 @@ def add_weights( if originally_missing == missing: logger.warning("failed to add any converted weights") - return None + return model_descr else: logger.info("added weights formats {}", originally_missing - missing) # resave model with updated rdf.yaml _ = save_bioimageio_package_as_folder(model_descr, output_path=output_path) - tested_model_descr = load_description_and_test(model_descr) - assert isinstance(tested_model_descr, ModelDescr) + tested_model_descr = load_description_and_test( + model_descr, format_version="latest", expected_type="model" + ) + if not isinstance(tested_model_descr, ModelDescr): + logger.error( + f"The updated model description at {output_path} did not pass testing." 
+ ) + return tested_model_descr diff --git a/bioimageio/core/weight_converters/_utils_onnx.py b/src/bioimageio/core/weight_converters/_utils_onnx.py similarity index 100% rename from bioimageio/core/weight_converters/_utils_onnx.py rename to src/bioimageio/core/weight_converters/_utils_onnx.py diff --git a/bioimageio/core/weight_converters/keras_to_tensorflow.py b/src/bioimageio/core/weight_converters/keras_to_tensorflow.py similarity index 95% rename from bioimageio/core/weight_converters/keras_to_tensorflow.py rename to src/bioimageio/core/weight_converters/keras_to_tensorflow.py index 09f54344..ce93347f 100644 --- a/bioimageio/core/weight_converters/keras_to_tensorflow.py +++ b/src/bioimageio/core/weight_converters/keras_to_tensorflow.py @@ -1,8 +1,9 @@ import os import shutil +import sys from pathlib import Path from tempfile import TemporaryDirectory -from typing import Union, no_type_check +from typing import Any, Union, no_type_check from zipfile import ZipFile import tensorflow # pyright: ignore[reportMissingTypeStubs] @@ -79,7 +80,10 @@ def convert( f"Tensorflow major versions of model {model_tf_major_ver} is not {tf_major_ver}" ) - with TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir: + td_kwargs: dict[str, Any] = ( + dict(ignore_cleanup_errors=True) if sys.version_info >= (3, 10) else {} + ) + with TemporaryDirectory(**td_kwargs) as temp_dir: local_weights = ensure_unzipped( weight_reader, Path(temp_dir) / "bioimageio_unzipped_tf_weights" ) @@ -119,7 +123,7 @@ def _convert_tf2( print("TensorFlow model exported to", output_path) return TensorflowSavedModelBundleWeightsDescr( - source=output_path, + source=output_path.absolute(), parent="keras_hdf5", tensorflow_version=Version(tensorflow.__version__), comment=f"Converted with bioimageio.core {__version__}.", @@ -134,7 +138,6 @@ def _convert_tf1( input_name: str, output_name: str, ) -> TensorflowSavedModelBundleWeightsDescr: - @no_type_check def build_tf_model(): keras_model = keras.models.load_model(keras_weight_path) @@ -163,7 +166,7 @@ def build_tf_model(): print("TensorFlow model exported to", output_path) return TensorflowSavedModelBundleWeightsDescr( - source=output_path, + source=output_path.absolute(), parent="keras_hdf5", tensorflow_version=Version(tensorflow.__version__), comment=f"Converted with bioimageio.core {__version__}.", diff --git a/bioimageio/core/weight_converters/pytorch_to_onnx.py b/src/bioimageio/core/weight_converters/pytorch_to_onnx.py similarity index 94% rename from bioimageio/core/weight_converters/pytorch_to_onnx.py rename to src/bioimageio/core/weight_converters/pytorch_to_onnx.py index 72d819b1..1627d2f8 100644 --- a/bioimageio/core/weight_converters/pytorch_to_onnx.py +++ b/src/bioimageio/core/weight_converters/pytorch_to_onnx.py @@ -6,7 +6,7 @@ from .. 
import __version__ from ..backends.pytorch_backend import load_torch_model -from ..digest_spec import get_member_id, get_test_inputs +from ..digest_spec import get_member_id, get_test_input_sample from ..proc_setup import get_pre_and_postprocessing from ._utils_onnx import get_dynamic_axes @@ -45,7 +45,7 @@ def convert( "The provided model does not have weights in the pytorch state dict format" ) - sample = get_test_inputs(model_descr) + sample = get_test_input_sample(model_descr) procs = get_pre_and_postprocessing( model_descr, dataset_for_initial_statistics=[sample] ) @@ -72,7 +72,7 @@ def convert( ) return OnnxWeightsDescr( - source=output_path, + source=output_path.absolute(), parent="pytorch_state_dict", opset_version=opset_version, comment=f"Converted with bioimageio.core {__version__}.", diff --git a/bioimageio/core/weight_converters/pytorch_to_torchscript.py b/src/bioimageio/core/weight_converters/pytorch_to_torchscript.py similarity index 98% rename from bioimageio/core/weight_converters/pytorch_to_torchscript.py rename to src/bioimageio/core/weight_converters/pytorch_to_torchscript.py index 3d0f281c..5660ebc8 100644 --- a/bioimageio/core/weight_converters/pytorch_to_torchscript.py +++ b/src/bioimageio/core/weight_converters/pytorch_to_torchscript.py @@ -60,7 +60,7 @@ def convert( scripted_model.save(output_path) return TorchscriptWeightsDescr( - source=output_path, + source=output_path.absolute(), pytorch_version=Version(torch.__version__), parent="pytorch_state_dict", comment=( diff --git a/bioimageio/core/weight_converters/torchscript_to_onnx.py b/src/bioimageio/core/weight_converters/torchscript_to_onnx.py similarity index 94% rename from bioimageio/core/weight_converters/torchscript_to_onnx.py rename to src/bioimageio/core/weight_converters/torchscript_to_onnx.py index 774e2875..aa695cbb 100644 --- a/bioimageio/core/weight_converters/torchscript_to_onnx.py +++ b/src/bioimageio/core/weight_converters/torchscript_to_onnx.py @@ -5,7 +5,7 @@ from bioimageio.spec.model.v0_5 import ModelDescr, OnnxWeightsDescr from .. 
import __version__ -from ..digest_spec import get_member_id, get_test_inputs +from ..digest_spec import get_member_id, get_test_input_sample from ..proc_setup import get_pre_and_postprocessing from ._utils_onnx import get_dynamic_axes @@ -44,7 +44,7 @@ def convert( "The provided model does not have weights in the torchscript format" ) - sample = get_test_inputs(model_descr) + sample = get_test_input_sample(model_descr) procs = get_pre_and_postprocessing( model_descr, dataset_for_initial_statistics=[sample] ) @@ -76,7 +76,7 @@ def convert( ) return OnnxWeightsDescr( - source=output_path, + source=output_path.absolute(), parent="torchscript", opset_version=opset_version, comment=f"Converted with bioimageio.core {__version__}.", diff --git a/tests/test_bioimageio_collection.py b/tests/test_bioimageio_collection.py index 84561eb1..a87f7d99 100644 --- a/tests/test_bioimageio_collection.py +++ b/tests/test_bioimageio_collection.py @@ -1,168 +1,36 @@ import os from itertools import chain from pathlib import Path -from typing import Any, Dict, Iterable, Mapping, Tuple +from typing import Iterable, Mapping, Tuple -import httpx import pytest -from pydantic import HttpUrl - from bioimageio.spec import InvalidDescr, settings from bioimageio.spec.common import Sha256 -from tests.utils import ParameterSet, expensive_test - -BASE_URL = "https://uk1s3.embassy.ebi.ac.uk/public-datasets/bioimage.io/" - - -def _get_latest_rdf_sources(): - entries: Any = httpx.get(BASE_URL + "all_versions.json").json()["entries"] - ret: Dict[str, Tuple[HttpUrl, Sha256]] = {} - for entry in entries: - version = entry["versions"][0] - ret[f"{entry['concept']}/{version['v']}"] = ( - HttpUrl(version["source"]), - Sha256(version["sha256"]), - ) - - return ret +from pydantic import HttpUrl +from tests.utils import ParameterSet, expensive_test -ALL_LATEST_RDF_SOURCES: Mapping[str, Tuple[HttpUrl, Sha256]] = _get_latest_rdf_sources() +TEST_RDF_SOURCES: Mapping[str, Tuple[HttpUrl, Sha256]] = { + # "affable-shark": ( # TODO: enable when updated with fixed torchscript and onnx weights + # HttpUrl( + # "https://hypha.aicell.io/bioimage-io/artifacts/affable-shark/files/rdf.yaml?version=v0" + # ), + # Sha256("b74944b4949591d3eaf231cf9ab259f91dec679863020178e6c3ddadd52a019c"), + # ), + "ambitious-sloth": ( + HttpUrl( + "https://hypha.aicell.io/bioimage-io/artifacts/ambitious-sloth/files/rdf.yaml?version=v0" + ), + Sha256("caf162e847a0812fb7704e7848b1ee68f46383278d8b74493553fe96750d1e39"), + ), +} def yield_bioimageio_yaml_urls() -> Iterable[ParameterSet]: - for descr_url, sha in ALL_LATEST_RDF_SOURCES.values(): - key = ( - str(descr_url) - .replace(BASE_URL, "") - .replace("/files/rdf.yaml", "") - .replace("/files/bioimageio.yaml", "") - ) + for key, (descr_url, sha) in TEST_RDF_SOURCES.items(): yield pytest.param(descr_url, sha, key, id=key) -KNOWN_INVALID: Mapping[str, str] = { - "affectionate-cow/0.1.0": "custom dependencies", - "ambitious-sloth/1.2": "requires inferno", - "appealing-popcorn/1": "missing license", - "appetizing-eggplant/1": "missing license", - "appetizing-peach/1": "missing license", - "authoritative-ballet-shoes/1.13.1": "invalid id", - "biapy/biapy/1": "invalid github user arratemunoz and lmescu", - "bitter-hot-dog/1": "missing license", - "bold-shorts/1.13": "invalid id", - "brisk-scarf/1.16.2": "missing license", - "buttery-apple/1": "missing cite", - "buttery-sandwich/1": "missing license", - "cheerful-cap/1.15.3": "missing license", - "chewy-garlic/1": "missing license", - "classy-googles/1": "missing license", - 
"committed-turkey/1.2": "error deserializing VarianceScaling", - "convenient-purse/1.14.1": "missing license", - "convenient-t-shirt/1.14.1": "missing license", - "cozy-hiking-boot/1.16.2": "missing license", - "creative-panda/1": "error deserializing Conv2D", - "crunchy-cookie/1": "missing license", - "dazzling-spider/0.1.0": "requires careamics", - "delectable-eggplant/1": "missing license", - "delicious-cheese/1": "missing license", - "determined-hedgehog/1": "wrong output shape?", - "discreet-rooster/1": "error deserializing VarianceScaling", - "discreete-rooster/1": "error deserializing VarianceScaling", - "divine-paella/1": "missing license", - "dl4miceverywhere/DL4MicEverywhere/1": "invalid id", - "dynamic-t-rex/1": "needs update to 0.5 for scale_linear with axes processing", - "easy-going-sauropod/1": ( - "CPU implementation of Conv3D currently only supports the NHWC tensor format." - ), - "efficient-chipmunk/1": "needs plantseg", - "emotional-cricket/1.1": "sporadic 403 responses from https://elifesciences.org", - "exciting-backpack/1.19.1": "missing license", - "exquisite-curry/1": "missing license", - "famous-fish/0.1.0": "list index out of range `fl[3]`", - "fiji/Fiji/1": "invalid id", - "flattering-bikini/1.13.2": "missing license", - "flexible-helmet/1.14.1": "missing license", - "fluffy-popcorn/1": "missing license", - "fluid-glasses/1.17.2": "missing license", - "fruity-sushi/1": "missing license", - "fun-high-heels/1.15.2": "missing license", - "funny-butterfly/1": "Do not specify an axis for scalar gain and offset values.", - "greedy-whale/1": "batch size is actually limited to 1", - "happy-elephant/0.1.0": "list index out of range `fl[3]`", - "happy-honeybee/0.1.0": "requires biapy", - "heroic-otter/0.1.0": "requires biapy", - "hpa/HPA-Classification/1": "invalid id", - "humorous-crab/1": "batch size is actually limited to 1", - "humorous-fox/0.1.0": "requires careamics", - "humorous-owl/1": "error deserializing GlorotUniform", - "icy/icy/1": "invalid github user 'None'", - "idealistic-turtle/0.1.0": "requires biapy", - "imjoy/BioImageIO-Packager/1": "invalid id", - "imjoy/GenericBioEngineApp/1": "invalid documentation suffix", - "imjoy/HPA-Single-Cell/1": "invalid documentation suffix", - "imjoy/ImageJ.JS/1": "invalid documentation suffix", - "imjoy/ImJoy/1": "invalid documentation suffix", - "imjoy/vizarr/1": "invalid documentation suffix", - "impartial-shark/1": "error deserializing VarianceScaling", - "indulgent-sandwich/1": "missing license", - "inspiring-sandal/1.13.3": "missing license", - "intelligent-lion/0.1.0": "requires biapy", - "irresistible-swimsuit/1.14.1": "missing license", - "joyful-deer/1": "needs update to 0.5 for scale_linear with axes processing", - "joyful-top-hat/2.2.1": "missing license", - "juicy-peanut/1": "missing license", - "light-swimsuit/1.13": "missing license", - "limited-edition-crown/1.14.1": "missing license", - "lively-t-shirt/1.13": "missing license", - "lucky-fox/1": ( - "torchscript runtime errro: Given groups=1, weight of size [90, 1, 3, 3], expected input[1, 2, 64, 64] to have 1 channels, but got 2 channels instead" - ), - "luscious-tomato/1": "missing license", - "mellow-broccoli/1": "missing license", - "mellow-takeout/1": "missing cite", - "merry-water-buffalo/0.1.0": "requires biapy", - "mesmerizing-shoe/1.14.1": "missing license", - "modest-spider/0.1.1": "non-batch id 'b'", - "naked-microbe/1": "unknown layer Convolution2D", - "nice-peacock/1": "invalid id", - "noisy-ox/1": "batch size is actually limited to 1", - 
"non-judgemental-eagle/1": "error deserializing GlorotUniform", - "nutty-burrito/1": "missing license", - "nutty-knuckle/1": "missing license", - "opalescent-ribbon/1.15.3": "missing license", - "palatable-curry/1": "missing license", - "polished-t-shirt/1.16.2": "missing license", - "powerful-sandal/1": "missing license", - "regal-ribbon/1.14.1": "missing license", - "resourceful-potato/1": "missing license", - "resplendent-ribbon/2.2.1": "missing license", - "rich-burrito/1": "missing license", - "rich-cheese/1": "missing license", - "savory-cheese/1": "missing license", - "silky-shorts/1.13": "missing license", - "slinky-bikini/1.15.1": "missing license", - "smooth-graduation-hat/1.15.0": "missing license", - "smooth-hat/1.1.0": "invalid id", - "smooth-safety-vest/1.14.1": "missing license, invalid id", - "smooth-scarf/1": "invalid id", - "sparkling-sari/1.0.0": "missing license, invalid id", - "straightforward-crocodile/1": ( - "needs update to 0.5 for scale_linear with axes processing" - ), - "striking-necktie/1.14.1": "invalid id", - "stupendous-sheep/1.1": "requires relativ import of attachment", - "sympathetic-mosquito/1": "error deserializing VarianceScaling", - "tempting-pizza/1": "missing license", - "timeless-running-shirt/1.13.2": "invalid id, missing license", - "uplifting-backpack/1.14.1": "invalid id, missing license", - "venomous-swan/0.1.0": "requires biapy", - "whimsical-helmet/2.1.2": "invalid id", - "wild-rhino/0.1.0": "requires careamics", - "zero/notebook_preview/1": "missing authors", -} - - def get_directory_size(path: Path): total_size = 0 for dirpath, _, filenames in os.walk(path): @@ -182,14 +50,15 @@ def test_rdf_format_to_populate_cache( key: str, ): """this test is redundant if `test_rdf` runs, but is used in the CI to populate the cache""" + from bioimageio.spec._internal.gh_utils import set_github_warning + if os.environ.get("BIOIMAGEIO_POPULATE_CACHE") != "1": pytest.skip("BIOIMAGEIO_POPULATE_CACHE != 1") - if key in KNOWN_INVALID: - pytest.skip(KNOWN_INVALID[key]) - - if (cache_size := get_directory_size(settings.cache_path)) > 8e9: - pytest.skip(f"reached 8GB cache size limit ({cache_size / 1e9:.2f} GB)") + if (cache_size := get_directory_size(settings.cache_path)) > 7e9: + msg = f"Reached 7GB cache size limit ({cache_size / 1e9:.2f} GB)" + set_github_warning("Reached cache size limit", msg) + pytest.skip(msg) from bioimageio.core import load_description @@ -206,9 +75,6 @@ def test_rdf( from bioimageio.spec import get_conda_env from bioimageio.spec.model import ModelDescr - if key in KNOWN_INVALID: - pytest.skip(KNOWN_INVALID[key]) - from bioimageio.core import load_description, load_description_and_test descr = load_description( @@ -253,8 +119,7 @@ def depends_on(dep: str) -> bool: assert not isinstance(descr, InvalidDescr), descr.validation_summary.display() or [ e.msg for e in descr.validation_summary.errors ] - assert ( - descr.validation_summary.status == "passed" - ), descr.validation_summary.display() or [ - e.msg for e in descr.validation_summary.errors - ] + assert descr.validation_summary.status == "passed", ( + descr.validation_summary.display() + or [e.msg for e in descr.validation_summary.errors] + ) diff --git a/tests/test_bioimageio_spec_version.py b/tests/test_bioimageio_spec_version.py index 2418baa5..cf10007b 100644 --- a/tests/test_bioimageio_spec_version.py +++ b/tests/test_bioimageio_spec_version.py @@ -3,6 +3,7 @@ from typing import Optional import pytest +from bioimageio.spec._internal.gh_utils import set_github_warning from 
packaging.version import Version @@ -38,4 +39,8 @@ def test_bioimageio_spec_version(conda_cmd: Optional[str]): assert spec_ver.count(".") == 3 pinned = Version(spec_ver) - assert pinned == released, "bioimageio.spec not pinned to the latest version" + if pinned != released: + set_github_warning( + "spec pin mismatch", + f"bioimageio.spec pinned to {pinned}, while latest version on conda-forge is {released}", + ) diff --git a/tests/test_prediction.py b/tests/test_prediction.py index bd30f064..04779158 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -15,7 +15,7 @@ load_model, predict, ) -from bioimageio.core.digest_spec import get_test_inputs, get_test_outputs +from bioimageio.core.digest_spec import get_test_input_sample, get_test_output_sample from bioimageio.spec import AnyModelDescr @@ -34,8 +34,8 @@ class Prep(NamedTuple): @pytest.fixture(scope="module") def prep(any_model: str): model = load_model(any_model, perform_io_checks=False) - input_sample = get_test_inputs(model) - output_sample = get_test_outputs(model) + input_sample = get_test_input_sample(model) + output_sample = get_test_output_sample(model) return Prep(model, create_prediction_pipeline(model), input_sample, output_sample) diff --git a/tests/test_prediction_pipeline.py b/tests/test_prediction_pipeline.py index 08e9f094..615d97ed 100644 --- a/tests/test_prediction_pipeline.py +++ b/tests/test_prediction_pipeline.py @@ -12,7 +12,10 @@ def _test_prediction_pipeline( model_package: Path, weights_format: SupportedWeightsFormat ): from bioimageio.core._prediction_pipeline import create_prediction_pipeline - from bioimageio.core.digest_spec import get_test_inputs, get_test_outputs + from bioimageio.core.digest_spec import ( + get_test_input_sample, + get_test_output_sample, + ) bio_model = load_description(model_package) assert isinstance( @@ -22,10 +25,10 @@ def _test_prediction_pipeline( bioimageio_model=bio_model, weight_format=weights_format ) - inputs = get_test_inputs(bio_model) + inputs = get_test_input_sample(bio_model) outputs = pp.predict_sample_without_blocking(inputs) - expected_outputs = get_test_outputs(bio_model) + expected_outputs = get_test_output_sample(bio_model) assert len(outputs.shape) == len(expected_outputs.shape) for m in expected_outputs.members: out = outputs.members[m].data diff --git a/tests/test_prediction_pipeline_device_management.py b/tests/test_prediction_pipeline_device_management.py index 0d2ff9b7..7c06bcb4 100644 --- a/tests/test_prediction_pipeline_device_management.py +++ b/tests/test_prediction_pipeline_device_management.py @@ -13,7 +13,10 @@ def _test_device_management(model_package: Path, weight_format: SupportedWeights from bioimageio.core import load_description from bioimageio.core._prediction_pipeline import create_prediction_pipeline - from bioimageio.core.digest_spec import get_test_inputs, get_test_outputs + from bioimageio.core.digest_spec import ( + get_test_input_sample, + get_test_output_sample, + ) if not hasattr(torch, "cuda") or torch.cuda.device_count() == 0: pytest.skip("Need at least one cuda device for this test") @@ -24,11 +27,11 @@ def _test_device_management(model_package: Path, weight_format: SupportedWeights bioimageio_model=bio_model, weight_format=weight_format, devices=["cuda:0"] ) - inputs = get_test_inputs(bio_model) + inputs = get_test_input_sample(bio_model) with pred_pipe as pp: outputs = pp.predict_sample_without_blocking(inputs) - expected_outputs = get_test_outputs(bio_model) + expected_outputs = get_test_output_sample(bio_model) 
assert len(outputs.shape) == len(expected_outputs.shape) for m in expected_outputs.members: diff --git a/tests/test_proc_ops.py b/tests/test_proc_ops.py index be87f54b..7ab2eaa0 100644 --- a/tests/test_proc_ops.py +++ b/tests/test_proc_ops.py @@ -2,6 +2,7 @@ import numpy as np import pytest +import scipy # pyright: ignore[reportMissingTypeStubs] import xarray as xr from typing_extensions import TypeGuard @@ -375,3 +376,40 @@ def test_sigmoid(tid: MemberId): exp = xr.DataArray(1.0 / (1 + np.exp(-np_data)), dims=axes) xr.testing.assert_allclose(exp, sample.members[tid].data, rtol=1e-5, atol=1e-7) + + +def test_softmax(tid: MemberId): + from bioimageio.core.proc_ops import Softmax + + shape = (3, 32, 32) + axes = ("channel", "y", "x") + np_data = np.random.rand(*shape) + data = xr.DataArray(np_data, dims=axes) + sample = Sample(members={tid: Tensor.from_xarray(data)}, stat={}, id=None) + softmax = Softmax(tid, tid, axis=AxisId("channel")) + softmax(sample) + + exp = xr.DataArray( + np.exp(np_data - np.max(np_data, axis=0, keepdims=True)) + / np.sum(np.exp(np_data - np.max(np_data, axis=0, keepdims=True)), axis=0), + dims=axes, + ) + xr.testing.assert_allclose(exp, sample.members[tid].data, rtol=1e-5, atol=1e-7) + + +def test_softmax_with_scipy(tid: MemberId): + from bioimageio.core.proc_ops import Softmax + + shape = (3, 32, 32) + axes = ("channel", "y", "x") + np_data = np.random.rand(*shape) + data = xr.DataArray(np_data, dims=axes) + sample = Sample(members={tid: Tensor.from_xarray(data)}, stat={}, id=None) + softmax = Softmax(tid, tid, axis=AxisId("channel")) + softmax(sample) + + exp = xr.DataArray( + scipy.special.softmax(np_data, axis=0), + dims=axes, + ) + xr.testing.assert_allclose(exp, sample.members[tid].data, rtol=1e-5, atol=1e-7)