diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index fdce86316878..4aaba04b40f1 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -3459,7 +3459,8 @@ def evaluation_loop(
         # Metrics!
         if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
             # all_labels maybe is a tuple when prediction_steps output label_mask
-            batch_labels = all_labels[0] if isinstance(all_labels, (list, tuple)) else all_labels
+            # Forward all_labels unchanged; previously only all_labels[0] was passed when it was a list/tuple.
+            batch_labels = all_labels
             metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=batch_labels))
         else:
             metrics = {}