
Commit 035e85a

fix a possible crash in the model output parsing
1 parent b1d77bd

File tree

2 files changed: +11 -4 lines


tibert/bertcoref.py

Lines changed: 8 additions & 1 deletion
@@ -872,7 +872,14 @@ def coreference_documents(
         G = nx.Graph()
         for m_j in range(top_mentions_nb):
             span_i = int(self.top_mentions_index[b_i][m_j].item())
-            span_coords = spans_idx[span_i]
+            # it is possible to have a top span that does not
+            # actually exist in a batch sample. This is because
+            # padding is done on wordpieces but not on words. In
+            # that case, we simply ignore that predicted span.
+            try:
+                span_coords = spans_idx[span_i]
+            except IndexError:
+                continue
 
             mention_score = float(self.mentions_scores[b_i][span_i].item())
             span_mention = Mention(
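
For context, here is a minimal standalone sketch (not part of the commit) of the failure mode this try/except guards against: after batch padding, self.top_mentions_index can hold a span index that is valid for the longest sample in the batch but out of range for the spans_idx of a shorter sample. The lists and values below are hypothetical stand-ins for those tensors.

    # Hypothetical stand-ins for spans_idx and self.top_mentions_index[b_i]:
    spans_idx = [(0, 1), (1, 2), (0, 2)]  # candidate spans of a short sample
    top_mentions_index = [0, 2, 5]        # 5 only exists in a longer, padded sample

    mentions = []
    for span_i in top_mentions_index:
        try:
            span_coords = spans_idx[span_i]  # raises IndexError without the guard
        except IndexError:
            # the span exists only because of padding; ignore it
            continue
        mentions.append(span_coords)

    print(mentions)  # [(0, 1), (0, 2)] -- the out-of-range index 5 is skipped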

tibert/predict.py

Lines changed: 3 additions & 3 deletions
@@ -205,7 +205,7 @@ def tensorize_chains(
     return CoreferenceDocument(merged_left.tokens + merged_right.tokens, new_chains)
 
 
-def _stream_predict_wpieced_coref_raw(
+def _stream_predict_coref_raw(
     documents: List[Union[str, List[str]]],
     model: BertForCoreferenceResolution,
     tokenizer: PreTrainedTokenizerFast,
@@ -292,7 +292,7 @@ def stream_predict_coref(
     :return: a list of ``CoreferenceDocument``, with annotated
         coreference chains.
     """
-    for out_docs, _ in _stream_predict_wpieced_coref_raw(
+    for out_docs, _ in _stream_predict_coref_raw(
         documents, model, tokenizer, batch_size, quiet, device_str, lang
     ):
         for out_doc in out_docs:
@@ -332,7 +332,7 @@ def predict_coref(
     if len(documents) == 0:
         return None
 
-    for out_docs, out in _stream_predict_wpieced_coref_raw(
+    for out_docs, out in _stream_predict_coref_raw(
         documents,
         model,
         tokenizer,
