Skip to content

Commit 98b00ea

Browse files
authored
Fix handling of small docs in coref (#28)
* Fix handling of small docs in coref

Docs with one or zero tokens fail in the coref component. This doesn't have a fix yet, just a failing test. (There is also a test for the span resolver, which does not fail.)

* Add example short doc to tests

It might be better to include this optionally? On the other hand, since it should just be ignored in training, having it always there is more thorough.

* Skip short docs

There can be no coref prediction for docs with one token (or no tokens). Attempting to treat docs like that normally causes a mess with size inference, so instead they're skipped. In training, this just involves skipping the docs in the update step. This is simple due to the fake batching structure, since the batch doesn't have to be maintained. In inference, this just involves short-circuiting to an empty prediction.

* Clean up retokenization test

The retokenization test is hard-coded to the training example because it manually merges some tokens, to make sure that the prediction and merge line up. It would probably be better to separate out the training data from the general example here, but for now narrowing the training data works.
1 parent 5cd4731 commit 98b00ea

File tree

3 files changed

+25
-3
lines changed

3 files changed

+25
-3
lines changed

spacy_experimental/coref/coref_component.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,11 @@ def predict(self, docs: Iterable[Doc]) -> List[MentionClusters]:
145145
"""
146146
out = []
147147
for doc in docs:
148+
if len(doc) < 2:
149+
# no coref in docs with 0 or 1 token
150+
out.append([])
151+
continue
152+
148153
scores, idxs = self.model.predict([doc])
149154
# idxs is a list of mentions (start / end idxs)
150155
# each item in scores includes scores and a mapping from scores to mentions
@@ -232,6 +237,9 @@ def update(
232237
predicted docs in coref training.
233238
"""
234239
)
240+
if len(eg.predicted) < 2:
241+
# no prediction possible for docs of length 0 or 1
242+
continue
235243
preds, backprop = self.model.begin_update([eg.predicted])
236244
score_matrix, mention_idx = preds
237245
loss, d_scores = self.get_loss([eg], score_matrix, mention_idx)

spacy_experimental/coref/tests/test_coref.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ def generate_train_data(prefix=DEFAULT_CLUSTER_PREFIX):
3737
}
3838
},
3939
),
40+
(
41+
# example short doc
42+
"ok",
43+
{"spans": {}}
44+
)
4045
]
4146
# fmt: on
4247
return data
@@ -83,11 +88,12 @@ def test_initialized(nlp):
8388

8489

8590
def test_initialized_short(nlp):
91+
# test that short or empty docs don't fail
8692
nlp.add_pipe("experimental_coref")
8793
nlp.initialize()
8894
assert nlp.pipe_names == ["experimental_coref"]
89-
text = "Hi there"
90-
doc = nlp(text)
95+
doc = nlp("Hi")
96+
doc = nlp("")
9197

9298

9399
def test_coref_serialization(nlp):
@@ -148,7 +154,8 @@ def test_overfitting_IO(nlp, train_data):
148154

149155
def test_tokenization_mismatch(nlp, train_data):
150156
train_examples = []
151-
for text, annot in train_data:
157+
# this is testing a specific test example, so just get the first doc
158+
for text, annot in train_data[0:1]:
152159
eg = Example.from_dict(nlp.make_doc(text), annot)
153160
ref = eg.reference
154161
char_spans = {}

spacy_experimental/coref/tests/test_span_resolver.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,13 @@ def test_not_initialized(nlp):
7979
with pytest.raises(ValueError, match="E109"):
8080
nlp(text)
8181

82+
def test_initialized_short(nlp):
83+
# docs with one or no tokens should not fail
84+
nlp.add_pipe("experimental_span_resolver")
85+
nlp.initialize()
86+
assert nlp.pipe_names == ["experimental_span_resolver"]
87+
nlp("hi")
88+
nlp("")
8289

8390
def test_span_resolver_serialization(nlp):
8491
# Test that the span resolver component can be serialized

0 commit comments

Comments
 (0)