Commit 315ba4c

Fixes the "wrong" computation of the AP score.
Given that sklearn changed the way AP scores are computed, this implements a custom version. The implementation follows the official Market-1501 computation of the AP.
1 parent b61ea61 commit 315ba4c

2 files changed, +40 -1 lines changed

README.md

Lines changed: 3 additions & 0 deletions
@@ -273,6 +273,9 @@ The evaluation code in this repository simply uses the scikit-learn code, and th
 Unfortunately, almost no paper mentions which code-base they used and how they computed `mAP` scores, so comparison is difficult.
 Other frameworks have [the same problem](https://github.com/Cysu/open-reid/issues/50), but we expect many not to be aware of this.
 
+To make the evaluation code independent of the sklearn version, we have implemented our own version of the average precision computation.
+This now follows the official Market-1501 code and results in directly comparable values.
+
 # Independent re-implementations
 
 These are the independent re-implementations of our paper that we are aware of,
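For context, here is a minimal sketch (not part of this commit, toy data only) of why the two conventions disagree: newer scikit-learn releases compute AP as the step-wise sum of (R_n - R_{n-1}) * P_n, whereas the Market-1501 protocol, like the function added to evaluate.py below, takes the trapezoidal area under the precision-recall curve.

```python
# Toy comparison of the two AP conventions; the labels and scores are invented.
import numpy as np
from sklearn.metrics import average_precision_score as sklearn_ap

y_true = np.array([1, 0, 1, 0, 0, 1])               # relevance of each gallery item
y_score = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4])  # similarity to the query

# scikit-learn: step-wise sum of precision at every recall increment.
print('sklearn AP:    ', sklearn_ap(y_true, y_score))  # ~0.722

# Market-1501 style: trapezoidal area under the precision-recall curve,
# mirroring the computation added to evaluate.py in this commit.
tp = np.cumsum(y_true[np.argsort(-y_score, kind='mergesort')])
recall = np.insert(tp / tp[-1], 0, 0.)
precision = np.insert(tp / np.arange(1, len(tp) + 1), 0, 1.)
print('Market-1501 AP:', np.sum(np.diff(recall) * (precision[1:] + precision[:-1]) / 2))  # ~0.678
```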

evaluate.py

Lines changed: 37 additions & 1 deletion
@@ -7,7 +7,6 @@
 import h5py
 import json
 import numpy as np
-from sklearn.metrics import average_precision_score
 import tensorflow as tf
 
 import common
@@ -52,6 +51,43 @@
     help='Batch size used during evaluation, adapt based on your memory usage.')
 
 
+def average_precision_score(y_true, y_score):
+    """Compute average precision (AP) from prediction scores.
+
+    This is a replacement for the scikit-learn version which, while likely more
+    correct, does not follow the same protocol as used in the default Market-1501
+    evaluation that first introduced this score to the person ReID field.
+
+    Args:
+        y_true (array): The binary labels for all data points.
+        y_score (array): The predicted scores for each sample for all data
+            points.
+
+    Raises:
+        ValueError: If the lengths of the labels and scores do not match.
+
+    Returns:
+        A float representing the average precision given the predictions.
+    """
+
+    if len(y_true) != len(y_score):
+        raise ValueError('The lengths of the labels and predictions must match, '
+                         'got lengths y_true:{} and y_score:{}'.format(
+                             len(y_true), len(y_score)))
+
+    y_true_sorted = y_true[np.argsort(-y_score, kind='mergesort')]
+
+    tp = np.cumsum(y_true_sorted)
+    total_true = np.sum(y_true_sorted)
+    recall = tp / total_true
+    recall = np.insert(recall, 0, 0.)
+    precision = tp / np.arange(1, len(tp) + 1)
+    precision = np.insert(precision, 0, 1.)
+    ap = np.sum(np.diff(recall) * ((precision[1:] + precision[:-1]) / 2))
+
+    return ap
+
+
 def main():
     # Verify that parameters are set correctly.
     args = parser.parse_args()
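As a hypothetical usage sketch (not code from this repository), the new function slots into a per-query mAP loop like the one below; `distances`, `query_pids`, and `gallery_pids` are made-up names, and the junk-image and same-camera filtering of the full Market-1501 protocol is left out.

```python
import numpy as np

def mean_average_precision(distances, query_pids, gallery_pids):
    """Toy mAP: mean of the per-query AP over the whole gallery."""
    gallery_pids = np.asarray(gallery_pids)
    aps = []
    for q_dist, q_pid in zip(np.asarray(distances), query_pids):
        y_true = gallery_pids == q_pid   # binary relevance of every gallery item
        y_score = -q_dist                # smaller distance -> higher score
        # Uses the average_precision_score defined in evaluate.py above.
        aps.append(average_precision_score(y_true, y_score))
    return np.mean(aps)
```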
