[wip] coref processing

Jemoka · Jemoka · commit e1dd0b1b38d5 · 2025-06-25T21:03:19.000-07:00
diff --git a/stanza/models/coref/model.py b/stanza/models/coref/model.py
@@ -379,23 +379,36 @@ def infer(self, raw_words, sent_ids) -> CorefResult:
         cased_words_new = []
         sent_id_new = []
 
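+        # Map each index in the augmented word list (after "_" insertion) back to the original word index.
+        # An inserted "_" shifts later indices forward; it is mapped to the half-step (indx - 1) + 0.5.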
+        coref_id_to_real_id_map = {}
+        
         for indx,(i,j) in enumerate(zip(raw_words,
                                         ((zeros_preds[word_start] > 0.5)
                                          .squeeze(-1)
                                          .tolist()))):
             if j:
+                coref_id_to_real_id_map[len(cased_words_new)] = (indx-1) + 0.5
+                coref_id_to_real_id_map[len(cased_words_new)+1] = indx
                 cased_words_new.extend(["_", i])
                 sent_id_new.extend([sent_ids[indx]]*2)
             else:
+                coref_id_to_real_id_map[len(cased_words_new)] = indx
                 cased_words_new.append(i)
                 sent_id_new.append(sent_ids[indx])
 
-        return self.run(self.build_doc({
+        results = self.run(self.build_doc({
             "document_id": "wb_doc_1",
             "cased_words": cased_words_new,
             "sent_id": sent_id_new
         }))
 
+        return {
+            "result": results,
+            "id_mapping": coref_id_to_real_id_map
+        }
+        
+
 
     def run(self,  # pylint: disable=too-many-locals
             doc: Doc,
diff --git a/stanza/pipeline/coref_processor.py b/stanza/pipeline/coref_processor.py
@@ -93,6 +93,9 @@ def process(self, document):
                 word_pos.append(word_idx)
 
         results = self._model.infer(cased_words, sent_ids)
+        id_mapping = results["id_mapping"]
+        results = results["result"]
+        
         clusters = []
         for span_cluster in results.span_clusters:
             if len(span_cluster) == 0:
@@ -137,12 +140,13 @@ def process(self, document):
                 sent_id = sent_ids[span[0]]
                 start_word = word_pos[span[0]]
                 end_word = word_pos[span[1]-1] + 1
-                mentions.append(CorefMention(sent_id, start_word, end_word))
+                mentions.append(CorefMention(sent_id, id_mapping[start_word], id_mapping[end_word]))
             representative = mentions[best_span]
             representative_text = extract_text(document, representative.sentence, representative.start_word, representative.end_word)
 
             chain = CorefChain(len(clusters), mentions, representative_text, best_span)
             clusters.append(chain)
 
+        breakpoint()
         document.coref = clusters
         return document