Skip to content

Commit d411156

Browse files
committed
update mentions indexing
1 parent d893317 commit d411156

File tree

2 files changed

+12
-12
lines changed

2 files changed

+12
-12
lines changed

tibert/bertcoref.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def prepared_document(
158158
except ValueError:
159159
continue
160160
new_chain.append(
161-
Mention(tokens[start_idx : end_idx + 1], start_idx, end_idx)
161+
Mention(tokens[start_idx : end_idx + 1], start_idx, end_idx + 1)
162162
)
163163
if len(new_chain) > 0:
164164
new_chains.append(new_chain)
@@ -193,7 +193,7 @@ def from_wpieced_to_tokenized(
193193
new_end_idx = wp_to_token[mention.end_idx]
194194
new_chain.append(
195195
Mention(
196-
tokens[new_start_idx : new_end_idx + 1],
196+
tokens[new_start_idx:new_end_idx],
197197
new_start_idx,
198198
new_end_idx,
199199
)
@@ -227,7 +227,7 @@ def from_labels(
227227
# singleton cluster
228228
if mention_labels[i] == 1:
229229
start_idx, end_idx = spans_idx[i]
230-
mention_tokens = tokens[start_idx : end_idx + 1]
230+
mention_tokens = tokens[start_idx:end_idx]
231231
chains.append([Mention(mention_tokens, start_idx, end_idx)])
232232
already_visited_mentions.append(i)
233233

@@ -237,7 +237,7 @@ def from_labels(
237237
continue
238238

239239
start_idx, end_idx = spans_idx[i]
240-
mention_tokens = tokens[start_idx : end_idx + 1]
240+
mention_tokens = tokens[start_idx:end_idx]
241241
chain = [Mention(mention_tokens, start_idx, end_idx)]
242242

243243
for j, label in enumerate(mlabels):
@@ -246,7 +246,7 @@ def from_labels(
246246
continue
247247

248248
start_idx, end_idx = spans_idx[j]
249-
mention_tokens = tokens[start_idx : end_idx + 1]
249+
mention_tokens = tokens[start_idx:end_idx]
250250
chain.append(Mention(mention_tokens, start_idx, end_idx))
251251
already_visited_mentions.append(j)
252252

@@ -426,9 +426,9 @@ def from_conll2012_file(
426426

427427
if mention_is_ending:
428428
mention_start_idx = open_mentions[chain_id].pop()
429-
mention_end_idx = len(document_tokens) - 1
429+
mention_end_idx = len(document_tokens)
430430
mention = Mention(
431-
document_tokens[mention_start_idx : mention_end_idx + 1],
431+
document_tokens[mention_start_idx:mention_end_idx],
432432
mention_start_idx,
433433
mention_end_idx,
434434
)
@@ -665,7 +665,7 @@ def coreference_documents(
665665
# the antecedent is the dummy mention : maybe we have
666666
# a one-mention chain ?
667667
if top_antecedent_idx == antecedents_nb - 1:
668-
if self.top_mentions_scores[b_i][m_j].item() > 0.0:
668+
if float(self.top_mentions_scores[b_i][m_j].item()) > 0.0:
669669
G.add_node(span_mention)
670670
continue
671671

@@ -907,7 +907,7 @@ def distance_between_spans(self, spans_nb: int, seq_size: int) -> torch.Tensor:
907907

908908
# distance between a span and its antecedent is defined to be
909909
# the span start index minus the antecedent span end index
910-
dist = start_end_idx_combinations[:, 0] - start_end_idx_combinations[:, 1]
910+
dist = start_end_idx_combinations[:, 0] - start_end_idx_combinations[:, 1] + 1
911911
assert dist.shape == (p * p,)
912912
dist = dist.reshape(spans_nb, spans_nb)
913913

tibert/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def spans_indexs(seq: List, max_len: int) -> List[Tuple[int, int]]:
3737
for i in range(1, min(len(seq), max_len + 1)):
3838
for span in windowed(range(len(seq)), i):
3939
span = cast(Tuple[int, ...], span)
40-
indexs.append((min(span), max(span)))
40+
indexs.append((min(span), max(span) + 1))
4141
return indexs
4242

4343

@@ -124,7 +124,7 @@ def pprint_coreference_document(doc: CoreferenceDocument):
124124
related_chains = [
125125
(chain_i, start_i, end_i)
126126
for chain_i, start_i, end_i in mentions
127-
if start_i == token_i or end_i == token_i
127+
if start_i == token_i or end_i - 1 == token_i
128128
]
129129

130130
for chain_i, start_i, _ in related_chains:
@@ -134,7 +134,7 @@ def pprint_coreference_document(doc: CoreferenceDocument):
134134
out.append(token)
135135

136136
for chain_i, _, end_i in related_chains:
137-
if token_i == end_i:
137+
if token_i == end_i - 1:
138138
out.append(f")[/red]")
139139

140140
try:

0 commit comments

Comments (0)