
Inconsistency between your implementation of AUPRO and gudovskiy/cflow-ad implementation #12

@AlirezaSalehy

Description


Dear authors,

First of all, thank you for your great work and for open-sourcing your code. I recently came across the AUPRO implementation you provide and compared it with another commonly used implementation, available at https://github.com/zqhang/AnomalyCLIP/blob/071263c1c7c1dd2a0edc5173c6e651a4621da728/metrics.py#L5 (which in turn references gudovskiy/cflow-ad). Here is the Python snippet I used for the comparison:

from sklearn.metrics import auc

import numpy as np
from skimage import measure
import time

# ref: https://github.com/gudovskiy/cflow-ad/blob/master/train.py
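# This reference sweeps max_step evenly spaced thresholds between the minimum and
# maximum anomaly score. At each threshold it records the mean per-region overlap
# (PRO) and the pixel-level FPR, then keeps the part of the curve with
# FPR < expect_fpr, min-max normalizes that FPR range to [0, 1], and integrates
# PRO over it with sklearn's auc.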
def cal_pro_score(masks, amaps, max_step=200, expect_fpr=0.3):
    binary_amaps = np.zeros_like(amaps, dtype=bool)
    min_th, max_th = amaps.min(), amaps.max()
    delta = (max_th - min_th) / max_step
    pros, fprs, ths = [], [], []
    for th in np.arange(min_th, max_th, delta):
        binary_amaps[amaps <= th], binary_amaps[amaps > th] = 0, 1
        pro = []
        for binary_amap, mask in zip(binary_amaps, masks):
            for region in measure.regionprops(measure.label(mask)):
                tp_pixels = binary_amap[region.coords[:, 0], region.coords[:, 1]].sum()
                pro.append(tp_pixels / region.area)
        inverse_masks = 1 - masks
        fp_pixels = np.logical_and(inverse_masks, binary_amaps).sum()
        fpr = fp_pixels / inverse_masks.sum()
        pros.append(np.array(pro).mean())
        fprs.append(fpr)
        ths.append(th)
    pros, fprs, ths = np.array(pros), np.array(fprs), np.array(ths)
    idxes = fprs < expect_fpr
    fprs = fprs[idxes]
    fprs = (fprs - fprs.min()) / (fprs.max() - fprs.min())
    pro_auc = auc(fprs, pros[idxes])
    return pro_auc

# ref: https://github.com/M-3LAB/open-iad/blob/main/metric/mvtec3d/au_pro.py#L205
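# This reference takes its thresholds from the sorted anomaly scores of the normal
# (OK) pixels rather than from a fixed min/max grid. Each ground-truth component
# keeps its own sorted scores and advances an index as the threshold grows, so
# per-component overlap and FPR are computed incrementally; the curve is then
# integrated up to the FPR limit with an interpolated trapezoid rule and
# normalized by that limit.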
import numpy as np
from scipy.ndimage import label
from bisect import bisect
__all__ = ['GroundTruthComponent', 'trapezoid', 'collect_anomaly_scores', 'compute_pro', 'calculate_au_pro']

class GroundTruthComponent:
    def __init__(self, anomaly_scores):
        self.anomaly_scores = anomaly_scores.copy()
        self.anomaly_scores.sort()
        self.index = 0
        self.last_threshold = None

    def compute_overlap(self, threshold):
        if self.last_threshold is not None:
            assert self.last_threshold <= threshold
        while self.index < len(self.anomaly_scores) and self.anomaly_scores[self.index] <= threshold:
            self.index += 1
        return 1.0 - self.index / len(self.anomaly_scores)

def trapezoid(x, y, x_max=None):
    x = np.array(x)
    y = np.array(y)
    finite_mask = np.logical_and(np.isfinite(x), np.isfinite(y))
    if not finite_mask.all():
        print("""WARNING: Not all x and y values passed to trapezoid are finite. Will continue with only the finite values.""")
    x = x[finite_mask]
    y = y[finite_mask]
    correction = 0.0
    if x_max is not None:
        if x_max not in x:
            ins = bisect(x, x_max)
            assert 0 < ins < len(x)
            y_interp = y[ins - 1] + ((y[ins] - y[ins - 1]) * (x_max - x[ins - 1]) / (x[ins] - x[ins - 1]))
            correction = 0.5 * (y_interp + y[ins - 1]) * (x_max - x[ins - 1])
        mask = x <= x_max
        x = x[mask]
        y = y[mask]
    return np.sum(0.5 * (y[1:] + y[:-1]) * (x[1:] - x[:-1])) + correction
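
# Example (illustrative): trapezoid([0.0, 0.5, 1.0], [0.0, 0.5, 1.0], x_max=0.3) gives
# 0.045 (the area under y = x cut off at x = 0.3 by interpolation), up to floating-point rounding.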

def collect_anomaly_scores(anomaly_maps, ground_truth_maps):
    assert len(anomaly_maps) == len(ground_truth_maps)
    ground_truth_components = []
    anomaly_scores_ok_pixels = np.zeros(len(ground_truth_maps) * ground_truth_maps[0].size)
    structure = np.ones((3, 3), dtype=int)
    ok_index = 0
    for gt_map, prediction in zip(ground_truth_maps, anomaly_maps):
        labeled, n_components = label(gt_map, structure)
        num_ok_pixels = len(prediction[labeled == 0])
        anomaly_scores_ok_pixels[ok_index:ok_index + num_ok_pixels] = prediction[labeled == 0].copy()
        ok_index += num_ok_pixels
        for k in range(n_components):
            component_scores = prediction[labeled == (k + 1)]
            ground_truth_components.append(GroundTruthComponent(component_scores))
    anomaly_scores_ok_pixels = np.resize(anomaly_scores_ok_pixels, ok_index)
    anomaly_scores_ok_pixels.sort()
    return ground_truth_components, anomaly_scores_ok_pixels

def compute_pro(anomaly_maps, ground_truth_maps, num_thresholds):
    ground_truth_components, anomaly_scores_ok_pixels = collect_anomaly_scores(anomaly_maps, ground_truth_maps)
    threshold_positions = np.linspace(0, len(anomaly_scores_ok_pixels) - 1, num=num_thresholds, dtype=int)
    fprs = [1.0]
    pros = [1.0]
    for pos in threshold_positions:
        threshold = anomaly_scores_ok_pixels[pos]
        fpr = 1.0 - (pos + 1) / len(anomaly_scores_ok_pixels)
        pro = 0.0
        for component in ground_truth_components:
            pro += component.compute_overlap(threshold)
        pro /= len(ground_truth_components)
        fprs.append(fpr)
        pros.append(pro)
    fprs = fprs[::-1]
    pros = pros[::-1]
    return fprs, pros

def calculate_au_pro(gts, predictions, integration_limit=0.3, num_thresholds=200):
    # Compute the PRO curve.
    pro_curve = compute_pro(anomaly_maps=predictions, ground_truth_maps=gts, num_thresholds=num_thresholds)

    # Compute the area under the PRO curve.
    au_pro = trapezoid(pro_curve[0], pro_curve[1], x_max=integration_limit)
    au_pro /= integration_limit

    # Return the evaluation metrics.
    return au_pro, pro_curve

# Run both implementations on the same inputs and report wall-clock time and AUPRO values.
def test_pro_score(masks, amaps):
    start_gudovskiy = time.time()
    pro_auc_gudovskiy = cal_pro_score(masks, amaps, max_step=200, expect_fpr=0.3)
    end_gudovskiy = time.time()
    gudovskiy_duration = end_gudovskiy - start_gudovskiy
    print(f"gudovskiy execution time: {gudovskiy_duration:.4f} seconds")

    start_openiad = time.time()
    pro_auc_openiad = calculate_au_pro(masks, amaps, integration_limit=0.3, num_thresholds=200)[0]
    end_openiad = time.time()
    openiad_duration = end_openiad - start_openiad
    print(f"openiad execution time: {openiad_duration:.4f} seconds")
    
    # assert np.isclose(pro_auc_gudovskiy, pro_auc_openiad), f"Results differ: gudovskiy={pro_auc_gudovskiy}, openiad={pro_auc_openiad}"
    print(f"Test passed: gudovskiy={pro_auc_gudovskiy}, openiad={pro_auc_openiad}")

if __name__ == "__main__":
    # Example usage (with small random data for testing)
    num_sam = 25  # number of synthetic test samples

    masks = np.random.randint(0, 2, (num_sam, 256, 256))  # Binary masks
    amaps = np.random.rand(num_sam, 256, 256)  # Anomaly maps
    
    test_pro_score(masks, amaps)
    
    masks = np.random.randint(0, 2, (num_sam, 64, 64))  # Binary masks
    amaps = np.random.rand(num_sam, 64, 64)  # Anomaly maps
    
    test_pro_score(masks, amaps)

The results of this comparison were quite interesting.
In particular, your implementation (the one labeled openiad in the snippet above) is roughly two orders of magnitude faster than the gudovskiy/cflow-ad version. However, the two implementations never return exactly the same AUPRO value, and the results also differ across runs.

Here are some sample comparison outputs:
run 1

gudovskiy execution time: 107.9296 seconds
openiad execution time: 1.0519 seconds
Test passed: gudovskiy=0.1527266103452964, openiad=0.15258837447609364
gudovskiy execution time: 10.1126 seconds
openiad execution time: 0.0717 seconds
Test passed: gudovskiy=0.1431284037814238, openiad=0.14169722566543597

run 2

gudovskiy execution time: 90.9615 seconds
openiad execution time: 1.0264 seconds
Test passed: gudovskiy=0.15251654329367212, openiad=0.15226420621476355
gudovskiy execution time: 8.4566 seconds
openiad execution time: 0.0695 seconds
Test passed: gudovskiy=0.15303256113352123, openiad=0.15058530542983684

run 3

gudovskiy execution time: 95.9508 seconds
openiad execution time: 0.9868 seconds
Test passed: gudovskiy=0.15314727468200867, openiad=0.15087067803750034
gudovskiy execution time: 8.0690 seconds
openiad execution time: 0.1248 seconds
Test passed: gudovskiy=0.17340418943755034, openiad=0.17253984597942126

Although the differences are generally small, they also seem to vary slightly across runs under the same settings.
I wonder whether this could be due to non-deterministic behavior somewhere in the evaluation pipeline (e.g., floating-point precision issues, sorting instability, or threshold discretization).
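
One caveat on my side: the test script above regenerates random masks and anomaly maps on every run without fixing a seed, so at least part of the run-to-run variation is expected. A minimal, purely illustrative tweak to the driver (not part of either implementation) that pins the NumPy seed makes the inputs identical across runs, so that any remaining gap comes from the implementations themselves:

np.random.seed(0)  # illustrative fixed seed; makes the random test data reproducible
num_sam = 25
masks = np.random.randint(0, 2, (num_sam, 256, 256))  # binary ground-truth masks
amaps = np.random.rand(num_sam, 256, 256)              # anomaly score maps
test_pro_score(masks, amaps)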

Would you happen to have any insight into what might cause these inconsistencies? Or, if you are not the author, do you know how I could contact the authors? Given how much faster this implementation is, it has the potential to become the de facto tool for AUPRO evaluation, especially if these inconsistencies can be clarified or minimized.

Thanks again for your time, and I look forward to hearing your thoughts!

Regards,
A. Salehi
