
Commit 035e85a

fix a possible crash in the model output parsing
1 parent b1d77bd

File tree

2 files changed: +11 -4 lines


tibert/bertcoref.py

Lines changed: 8 additions & 1 deletion
@@ -872,7 +872,14 @@ def coreference_documents(
         G = nx.Graph()
         for m_j in range(top_mentions_nb):
             span_i = int(self.top_mentions_index[b_i][m_j].item())
-            span_coords = spans_idx[span_i]
+            # it is possible to have a top span that does not
+            # actually exist in a batch sample. This is because
+            # padding is done on wordpieces but not on words. In
+            # that case, we simply ignore that predicted span.
+            try:
+                span_coords = spans_idx[span_i]
+            except IndexError:
+                continue
 
             mention_score = float(self.mentions_scores[b_i][span_i].item())
             span_mention = Mention(
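
For context, here is a minimal standalone sketch (not part of the commit) of the failure mode this try/except guards against: after batch padding, self.top_mentions_index can hold a span index that is valid for the longest sample in the batch but out of range for the spans_idx of a shorter sample. The lists and values below are hypothetical stand-ins for those tensors.

    # Hypothetical stand-ins for spans_idx and self.top_mentions_index[b_i]:
    spans_idx = [(0, 1), (1, 2), (0, 2)]  # candidate spans of a short sample
    top_mentions_index = [0, 2, 5]        # 5 only exists in a longer, padded sample

    mentions = []
    for span_i in top_mentions_index:
        try:
            span_coords = spans_idx[span_i]  # raises IndexError without the guard
        except IndexError:
            # the span exists only because of padding; ignore it
            continue
        mentions.append(span_coords)

    print(mentions)  # [(0, 1), (0, 2)] -- the out-of-range index 5 is skipped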

tibert/predict.py

Lines changed: 3 additions & 3 deletions
@@ -205,7 +205,7 @@ def tensorize_chains(
     return CoreferenceDocument(merged_left.tokens + merged_right.tokens, new_chains)
 
 
-def _stream_predict_wpieced_coref_raw(
+def _stream_predict_coref_raw(
     documents: List[Union[str, List[str]]],
     model: BertForCoreferenceResolution,
     tokenizer: PreTrainedTokenizerFast,
@@ -292,7 +292,7 @@ def stream_predict_coref(
     :return: a list of ``CoreferenceDocument``, with annotated
         coreference chains.
     """
-    for out_docs, _ in _stream_predict_wpieced_coref_raw(
+    for out_docs, _ in _stream_predict_coref_raw(
         documents, model, tokenizer, batch_size, quiet, device_str, lang
     ):
         for out_doc in out_docs:
@@ -332,7 +332,7 @@ def predict_coref(
     if len(documents) == 0:
         return None
 
-    for out_docs, out in _stream_predict_wpieced_coref_raw(
+    for out_docs, out in _stream_predict_coref_raw(
         documents,
         model,
         tokenizer,
