
Commit a81ade8

Merge pull request #130 from ImageMarkup/cleanup
Clean up the build and test infrastructure
2 parents cee8bc7 + 58915e3 commit a81ade8

21 files changed: +225 −200 lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
@@ -9,13 +9,13 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.13']
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           lfs: true
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install tox

.github/workflows/release.yml

Lines changed: 12 additions & 11 deletions
@@ -5,23 +5,24 @@ on:
 jobs:
   publish:
     runs-on: ubuntu-latest
+    environment: release
+    permissions:
+      id-token: write
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           # Tags are needed to compute the current version number
           fetch-depth: 0
       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.x'
-      - name: Install tox
+          python-version: "3.x"
+      - name: Install Python build
         run: |
           pip install --upgrade pip
-          pip install tox
-      - name: Publish to PyPI
-        env:
-          TWINE_USERNAME: "__token__"
-          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
-          TWINE_NON_INTERACTIVE: "true"
+          pip install build
+      - name: Build the Python distribution
         run: |
-          tox -e release
+          python -m build
+      - name: Publish the Python distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-FROM python:3.8
+FROM python:3.13

 WORKDIR /usr/src/isic-challenge-scoring

README.md

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ Automated scoring code for the [ISIC Challenge](http://challenge.isic-archive.co

 ## Installation
 ### Python
-Python version >= 3.8 is required.
+Python version >= 3.13 is required.
 ```bash
 pip install isic-challenge-scoring
 ```

isic_challenge_scoring/__init__.py

Lines changed: 0 additions & 8 deletions
@@ -1,13 +1,5 @@
-from importlib.metadata import PackageNotFoundError, version
-
 from isic_challenge_scoring.classification import ClassificationMetric, ClassificationScore
 from isic_challenge_scoring.segmentation import SegmentationScore
 from isic_challenge_scoring.types import ScoreError

 __all__ = ['ClassificationScore', 'SegmentationScore', 'ScoreError', 'ClassificationMetric']
-
-try:
-    __version__ = version('isic-challenge-scoring')
-except PackageNotFoundError:
-    # package is not installed
-    pass
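With the `__version__` shim removed, the package no longer sets a module-level version attribute. Callers that need the installed version can still query the package metadata directly; the sketch below shows the standard-library lookup (the `get_scoring_version` helper and its `None` fallback are illustrative, not part of this commit):

```python
from importlib.metadata import PackageNotFoundError, version


def get_scoring_version() -> str | None:
    """Return the installed isic-challenge-scoring version, or None if not installed."""
    try:
        return version('isic-challenge-scoring')
    except PackageNotFoundError:
        return None


print(get_scoring_version())
```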

isic_challenge_scoring/classification.py

Lines changed: 18 additions & 16 deletions
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 import enum
 import pathlib
-from typing import Dict, TextIO, cast
+from typing import TextIO, cast

 import pandas as pd

@@ -23,7 +23,7 @@ class ClassificationMetric(enum.Enum):
 class ClassificationScore(Score):
     per_category: pd.DataFrame
     macro_average: pd.Series
-    rocs: Dict[str, pd.DataFrame]
+    rocs: dict[str, pd.DataFrame]
     aggregate: pd.Series

     def __init__(
@@ -40,28 +40,29 @@ def __init__(
                 self._category_score(
                     truth_probabilities[category],
                     prediction_probabilities[category],
-                    truth_weights.score_weight,
+                    truth_weights['score_weight'],
                     category,
                 )
                 for category in categories
             ]
         )
-        self.macro_average = self.per_category.mean(axis='index').rename(
-            'macro_average', inplace=True
-        )
+        # TODO: Fixed by https://github.com/pandas-dev/pandas-stubs/pull/1105
+        self.macro_average = self.per_category.mean(  # type: ignore[assignment]
+            axis='index'
+        ).rename('macro_average', inplace=True)
         self.rocs = {
             category: metrics.roc(
                 truth_probabilities[category],
                 prediction_probabilities[category],
-                truth_weights.score_weight,
+                truth_weights['score_weight'],
             )
             for category in categories
         }
         # Multi-category aggregate metrics
         self.aggregate = pd.Series(
             {
                 'balanced_accuracy': metrics.balanced_multiclass_accuracy(
-                    truth_probabilities, prediction_probabilities, truth_weights.score_weight
+                    truth_probabilities, prediction_probabilities, truth_weights['score_weight']
                 )
             },
             index=['balanced_accuracy'],
@@ -71,29 +72,29 @@ def __init__(
         if target_metric == ClassificationMetric.BALANCED_ACCURACY:
             self.overall = self.aggregate.at['balanced_accuracy']
             self.validation = metrics.balanced_multiclass_accuracy(
-                truth_probabilities, prediction_probabilities, truth_weights.validation_weight
+                truth_probabilities, prediction_probabilities, truth_weights['validation_weight']
             )
         elif target_metric == ClassificationMetric.AVERAGE_PRECISION:
-            self.overall = self.macro_average['ap']
+            self.overall = self.macro_average.at['ap']
             per_category_ap = pd.Series(
                 [
                     metrics.average_precision(
                         truth_probabilities[category],
                         prediction_probabilities[category],
-                        truth_weights.validation_weight,
+                        truth_weights['validation_weight'],
                     )
                     for category in categories
                 ]
             )
             self.validation = per_category_ap.mean()
         elif target_metric == ClassificationMetric.AUC:
-            self.overall = self.macro_average['auc']
+            self.overall = self.macro_average.at['auc']
             per_category_auc = pd.Series(
                 [
                     metrics.auc(
                         truth_probabilities[category],
                         prediction_probabilities[category],
-                        truth_weights.validation_weight,
+                        truth_weights['validation_weight'],
                     )
                     for category in categories
                 ]
@@ -212,9 +213,10 @@ def from_file(
         prediction_file: pathlib.Path,
         target_metric: ClassificationMetric,
     ) -> ClassificationScore:
-        with truth_file.open('r') as truth_file_stream, prediction_file.open(
-            'r'
-        ) as prediction_file_stream:
+        with (
+            truth_file.open('r') as truth_file_stream,
+            prediction_file.open('r') as prediction_file_stream,
+        ):
             return cls.from_stream(
                 truth_file_stream,
                 prediction_file_stream,
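Two idioms recur in this file's changes: `truth_weights['score_weight']` bracket indexing replaces `truth_weights.score_weight` attribute access (bracket indexing names the column explicitly and is the form the pandas type stubs can resolve for arbitrary column names), and `from_file` switches to parenthesized context managers, which require Python 3.10 or later. A minimal sketch of both, using hypothetical data and file names:

```python
import pathlib

import pandas as pd

# Bracket indexing works for any column name; attribute access only works when
# the name is a valid identifier and does not shadow a DataFrame method.
truth_weights = pd.DataFrame({'score_weight': [1.0, 0.5], 'validation_weight': [1.0, 1.0]})
score_weight = truth_weights['score_weight']

# Parenthesized context managers (Python 3.10+) let several `with` items wrap
# across lines without backslashes; 'truth.csv' and 'prediction.csv' are stand-ins.
with (
    pathlib.Path('truth.csv').open('r') as truth_stream,
    pathlib.Path('prediction.csv').open('r') as prediction_stream,
):
    print(truth_stream.readline(), prediction_stream.readline())
```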

isic_challenge_scoring/confusion.py

Lines changed: 2 additions & 4 deletions
@@ -1,14 +1,12 @@
-from typing import Optional, Tuple, Union
-
 import numpy as np
 import pandas as pd


 def create_binary_confusion_matrix(
     truth_binary_values: np.ndarray,
     prediction_binary_values: np.ndarray,
-    weights: Optional[np.ndarray] = None,
-    name: Optional[Union[str, Tuple[str, ...]]] = None,
+    weights: np.ndarray | None = None,
+    name: str | tuple[str, ...] | None = None,
 ) -> pd.Series:
     # This implementation is:
     # ~30x faster than sklearn.metrics.confusion_matrix
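The new signature relies on PEP 604 union syntax (`X | None`, Python 3.10+) and PEP 585 builtin generics (`tuple[str, ...]`, Python 3.9+), so the `typing` imports are no longer needed. A small equivalence sketch:

```python
from typing import Optional, Tuple, Union

import numpy as np

# Each pair of annotations is equivalent; the second form of each needs no
# imports from `typing` on modern Python.
weights_old: Optional[np.ndarray] = None
weights_new: np.ndarray | None = None

name_old: Optional[Union[str, Tuple[str, ...]]] = None
name_new: str | tuple[str, ...] | None = None
```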

isic_challenge_scoring/load_csv.py

Lines changed: 4 additions & 4 deletions
@@ -1,12 +1,12 @@
-from typing import TextIO, Tuple
+from typing import TextIO

 import numpy as np
 import pandas as pd

 from isic_challenge_scoring.types import ScoreError


-def parse_truth_csv(csv_file_stream: TextIO) -> Tuple[pd.DataFrame, pd.DataFrame]:
+def parse_truth_csv(csv_file_stream: TextIO) -> tuple[pd.DataFrame, pd.DataFrame]:
     table = pd.read_csv(csv_file_stream, header=0)

     table.set_index('image', drop=True, inplace=True, verify_integrity=False)
@@ -87,7 +87,7 @@ def parse_csv(csv_file_stream: TextIO, categories: pd.Index) -> pd.DataFrame:
     # TODO: identify specific failed rows

     out_of_range_rows = probabilities[
-        probabilities.applymap(lambda x: x < 0.0 or x > 1.0).any(axis='columns')
+        probabilities.map(lambda x: x < 0.0 or x > 1.0).any(axis='columns')
     ].index
     if not out_of_range_rows.empty:
         raise ScoreError(
@@ -120,4 +120,4 @@ def validate_rows(

 def sort_rows(probabilities: pd.DataFrame) -> None:
     """Sort rows by labels, in-place."""
-    probabilities.sort_index(axis='rows', inplace=True)
+    probabilities.sort_index(axis='index', inplace=True)
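`DataFrame.applymap` was deprecated in pandas 2.1 in favor of `DataFrame.map`, which applies a function elementwise in the same way, so the range check keeps its behavior. A sketch with illustrative category columns (the column names and image IDs are hypothetical):

```python
import pandas as pd

probabilities = pd.DataFrame(
    {'melanoma': [0.2, 1.4], 'nevus': [0.8, -0.1]},
    index=['ISIC_0000001', 'ISIC_0000002'],
)

# DataFrame.map (pandas >= 2.1) is the elementwise replacement for applymap.
out_of_range_rows = probabilities[
    probabilities.map(lambda x: x < 0.0 or x > 1.0).any(axis='columns')
].index
print(list(out_of_range_rows))  # only ISIC_0000002 has out-of-range values
```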

isic_challenge_scoring/load_image.py

Lines changed: 7 additions & 6 deletions
@@ -1,7 +1,8 @@
+from collections.abc import Generator
 from dataclasses import dataclass, field
 import pathlib
 import re
-from typing import Generator, Match, Optional, Set
+from re import Match

 from PIL import Image, UnidentifiedImageError
 import numpy as np
@@ -16,15 +17,15 @@ class ImagePair:
     prediction_file: pathlib.Path = field(init=False)
     prediction_image: np.ndarray = field(init=False)
     image_id: str = field(init=False)
-    attribute_id: Optional[str] = field(default=None, init=False)
+    attribute_id: str | None = field(default=None, init=False)

     def parse_image_id(self) -> None:
-        image_id_match: Optional[Match[str]] = re.search(r'ISIC_[0-9]{7}', self.truth_file.stem)
+        image_id_match: Match[str] | None = re.search(r'ISIC_[0-9]{7}', self.truth_file.stem)
         if not image_id_match:
             raise Exception(f'Unknown ground truth file: {self.truth_file.name}.')
         self.image_id = image_id_match.group(0)

-        attribute_id_match: Optional[Match[str]] = re.search(
+        attribute_id_match: Match[str] | None = re.search(
             r'attribute_([a-z_]+)', self.truth_file.stem
         )
         if attribute_id_match:
@@ -93,7 +94,7 @@ def load_segmentation_image(image_path: pathlib.Path) -> np.ndarray:

 def assert_binary_image(image: np.ndarray, image_path: pathlib.Path) -> np.ndarray:
     """Ensure a NumPy array image is binary, correcting if possible."""
-    image_values: Set[int] = set(np.unique(image))
+    image_values: set[int] = set(np.unique(image))
     if image_values <= {0, 255}:
         # Expected values
         pass
@@ -112,7 +113,7 @@ def assert_binary_image(image: np.ndarray, image_path: pathlib.Path) -> np.ndarr

 def iter_image_pairs(
     truth_path: pathlib.Path, prediction_path: pathlib.Path
-) -> Generator[ImagePair, None, None]:
+) -> Generator[ImagePair]:
     for truth_file in sorted(truth_path.iterdir()):
         if truth_file.name in {'ATTRIBUTION.txt', 'LICENSE.txt'}:
             continue
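The return annotation shortens from `Generator[ImagePair, None, None]` to `Generator[ImagePair]`: as of Python 3.13, the send and return type parameters of `collections.abc.Generator` default to `None` (PEP 696), so the one-argument form means the same thing. A minimal sketch with a stand-in element type:

```python
from collections.abc import Generator


def count_up(limit: int) -> Generator[int]:
    # Equivalent to Generator[int, None, None] on Python 3.13+, where the send
    # and return type parameters default to None.
    for value in range(limit):
        yield value


for n in count_up(3):
    print(n)
```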

isic_challenge_scoring/metrics.py

Lines changed: 15 additions & 16 deletions
@@ -1,4 +1,3 @@
-from typing import Tuple
 import warnings

 import numpy as np
@@ -12,7 +11,7 @@ def _to_labels(probabilities: pd.DataFrame) -> pd.Series:

     # Find places where there are multiple maximum values
     max_probabilities = probabilities.max(axis='columns')
-    is_max: pd.DataFrame = probabilities.eq(max_probabilities, axis='rows')
+    is_max: pd.DataFrame = probabilities.eq(max_probabilities, axis='index')
     number_of_max: pd.Series = is_max.sum(axis='columns')
     multiple_max: pd.Series = number_of_max.gt(1)
     # Set those locations as an 'undecided' label
@@ -60,7 +59,7 @@ def _roc_curve(
     prediction_probabilities: pd.Series,
     weights: pd.Series,
     drop_intermediate: bool = True,
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Call sklearn.metrics.roc_curve in a more performant way."""
     # This is much faster to compute if the zero-weighted probabilities are eliminated first
     nonzero_weights = weights.ne(0.0)
@@ -69,15 +68,18 @@ def _roc_curve(
     weights = weights[nonzero_weights]

     # An additional minor optimization
-    if weights.eq(1.0).all():
-        weights = None
+    sample_weight = None if weights.eq(1.0).all() else weights

     fp_rates, tp_rates, thresholds = sklearn.metrics.roc_curve(
         truth_probabilities,
         prediction_probabilities,
-        sample_weight=weights,
+        sample_weight=sample_weight,
         drop_intermediate=drop_intermediate,
     )
+    # This can contain infinity values so replace them with 1.0.
+    # https://github.com/scikit-learn/scikit-learn/pull/26194
+    thresholds = np.nan_to_num(thresholds, posinf=1.0)
+
     return fp_rates, tp_rates, thresholds


@@ -131,8 +133,8 @@ def binary_threshold_jaccard(cm: pd.Series, threshold: float = 0.65) -> float:

 def binary_dice(cm: pd.Series) -> float:
     if cm.at['TP'] + cm.at['FP'] + cm.at['FN'] == 0:
-        # Dice is ill-defined if all are negative and the prediction is perfect, but we'll
-        # just score that as a perfect answer
+        # Dice / F1 is ill-defined if all are negative and the prediction is perfect.
+        # See the rationale in "binary_ppv", which also applies here.
         return 1.0
     else:
         return (2 * cm.at['TP']) / ((2 * cm.at['TP']) + cm.at['FP'] + cm.at['FN'])
@@ -152,11 +154,8 @@ def binary_ppv(cm: pd.Series) -> float:

 def binary_npv(cm: pd.Series) -> float:
     if cm.at['TN'] + cm.at['FN'] == 0:
-        # NPV is ill-defined if all predictions are positive; we'll score it as perfect, which
-        # doesn't penalize the case where all are truly positive (a good predictor), and is sane
-        # for the case where some are truly negative (a limitation of this metric)
-        # Note, some other implementations would score the latter case as 0:
-        # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
+        # NPV is ill-defined if all predictions are positive.
+        # See the rationale in "binary_ppv", which also applies here.
         return 1.0
     else:
         return cm.at['TN'] / (cm.at['TN'] + cm.at['FN'])
@@ -168,7 +167,7 @@ def auc(
     auc = sklearn.metrics.roc_auc_score(
         truth_probabilities, prediction_probabilities, sample_weight=weights
     )
-    return auc
+    return float(auc)


 def auc_above_sensitivity(
@@ -227,7 +226,7 @@ def auc_above_sensitivity(
     fp_rates_segment = np.insert(fp_rates_segment, 0, fp_rate_threshold)

     partial_auc = sklearn.metrics.auc(fp_rates_segment, tp_rates_segment)
-    return partial_auc
+    return float(partial_auc)


 def average_precision(
@@ -242,7 +241,7 @@ def average_precision(
     ap = sklearn.metrics.average_precision_score(
         truth_probabilities, prediction_probabilities, sample_weight=weights
     )
-    return ap
+    return float(ap)


 def roc(
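Two notes on the `_roc_curve` change: binding the optional weights to a separate `sample_weight` variable keeps `weights` typed as a `pd.Series` throughout, and the added `np.nan_to_num` call clamps the infinite threshold that recent scikit-learn releases prepend to the thresholds array (see the scikit-learn PR linked in the diff). A small sketch of the clamping step with made-up data:

```python
import numpy as np
import sklearn.metrics

truth = np.array([0, 0, 1, 1])
scores = np.array([0.1, 0.4, 0.35, 0.8])

fp_rates, tp_rates, thresholds = sklearn.metrics.roc_curve(truth, scores)

# Recent scikit-learn prepends an infinite threshold (the "predict nothing as
# positive" point); replace it with 1.0 so downstream code sees finite values.
thresholds = np.nan_to_num(thresholds, posinf=1.0)
print(thresholds)
```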

0 commit comments
