Commit f6ed07b

Author: Kabir Khan

Use nlp.pipe in EntityRuler for phrase patterns in add_patterns (#4931)

* Fix ent_ids and labels properties when id attribute used in patterns
* Use set for labels
* Sort ent_ids for comparison in entity_ruler tests
* Fix entity_ruler ent_ids test
* Add to set
* Run make_doc optimistically if using phrase matcher patterns
* Remove unused coveragerc used during testing
* Format
* Refactor EntityRuler.add_patterns to use nlp.pipe for phrase patterns; improves speed substantially
* Remove old add_patterns function
* Fix spacing
* Make sure token_patterns are loaded as well; before, the generator was being emptied in from_disk

1 parent 72c964b · commit f6ed07b

File tree

2 files changed: +65 / -3 lines changed

spacy/pipeline/entityruler.py

Lines changed: 38 additions & 3 deletions

```diff
@@ -8,7 +8,7 @@
 from ..errors import Errors
 from ..compat import basestring_
 from ..util import ensure_path, to_disk, from_disk
-from ..tokens import Span
+from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
 
 DEFAULT_ENT_ID_SEP = "||"
@@ -162,6 +162,7 @@ def ent_ids(self):
     @property
     def patterns(self):
         """Get all patterns that were added to the entity ruler.
+
         RETURNS (list): The original patterns, one dictionary per pattern.
 
         DOCS: https://spacy.io/api/entityruler#patterns
@@ -194,6 +195,7 @@ def add_patterns(self, patterns):
 
         DOCS: https://spacy.io/api/entityruler#add_patterns
         """
+
         # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
         try:
             current_index = self.nlp.pipe_names.index(self.name)
@@ -203,7 +205,33 @@ def add_patterns(self, patterns):
         except ValueError:
             subsequent_pipes = []
         with self.nlp.disable_pipes(subsequent_pipes):
+            token_patterns = []
+            phrase_pattern_labels = []
+            phrase_pattern_texts = []
+            phrase_pattern_ids = []
+
             for entry in patterns:
+                if isinstance(entry["pattern"], basestring_):
+                    phrase_pattern_labels.append(entry["label"])
+                    phrase_pattern_texts.append(entry["pattern"])
+                    phrase_pattern_ids.append(entry.get("id"))
+                elif isinstance(entry["pattern"], list):
+                    token_patterns.append(entry)
+
+            phrase_patterns = []
+            for label, pattern, ent_id in zip(
+                phrase_pattern_labels,
+                self.nlp.pipe(phrase_pattern_texts),
+                phrase_pattern_ids
+            ):
+                phrase_pattern = {
+                    "label": label, "pattern": pattern, "id": ent_id
+                }
+                if ent_id:
+                    phrase_pattern["id"] = ent_id
+                phrase_patterns.append(phrase_pattern)
+
+            for entry in token_patterns + phrase_patterns:
                 label = entry["label"]
                 if "id" in entry:
                     ent_label = label
@@ -212,8 +240,8 @@ def add_patterns(self, patterns):
                     self._ent_ids[key] = (ent_label, entry["id"])
 
                 pattern = entry["pattern"]
-                if isinstance(pattern, basestring_):
-                    self.phrase_patterns[label].append(self.nlp(pattern))
+                if isinstance(pattern, Doc):
+                    self.phrase_patterns[label].append(pattern)
                 elif isinstance(pattern, list):
                     self.token_patterns[label].append(pattern)
                 else:
@@ -226,6 +254,8 @@ def add_patterns(self, patterns):
     def _split_label(self, label):
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
 
+        label (str): The value of label in a pattern entry
+
         RETURNS (tuple): ent_label, ent_id
         """
         if self.ent_id_sep in label:
@@ -239,6 +269,9 @@ def _split_label(self, label):
     def _create_label(self, label, ent_id):
         """Join Entity label with ent_id if the pattern has an `id` attribute
 
+        label (str): The label to set for ent.label_
+        ent_id (str): The label
+
         RETURNS (str): The ent_label joined with configured `ent_id_sep`
         """
         if isinstance(ent_id, basestring_):
@@ -250,6 +283,7 @@ def from_bytes(self, patterns_bytes, **kwargs):
 
         patterns_bytes (bytes): The bytestring to load.
         **kwargs: Other config paramters, mostly for consistency.
+
         RETURNS (EntityRuler): The loaded entity ruler.
 
         DOCS: https://spacy.io/api/entityruler#from_bytes
@@ -292,6 +326,7 @@ def from_disk(self, path, **kwargs):
 
         path (unicode / Path): The JSONL file to load.
         **kwargs: Other config paramters, mostly for consistency.
+
         RETURNS (EntityRuler): The loaded entity ruler.
 
         DOCS: https://spacy.io/api/entityruler#from_disk
```
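The two-pass structure of the refactored `add_patterns` can be sketched in isolation. Below is a minimal, spaCy-free sketch: `fake_pipe` is a hypothetical stand-in for `nlp.pipe`, which the real code uses to process all phrase-pattern texts in one batched call, while token patterns (lists of dicts) pass through unchanged.

```python
def fake_pipe(texts):
    """Hypothetical stand-in for nlp.pipe: yields a processed form of each text."""
    for text in texts:
        yield text.split()  # a real pipeline would yield Doc objects


def add_patterns(patterns):
    token_patterns = []
    phrase_pattern_labels = []
    phrase_pattern_texts = []
    phrase_pattern_ids = []

    # First pass: sort entries by pattern type. Strings are phrase patterns,
    # lists of dicts are token patterns.
    for entry in patterns:
        if isinstance(entry["pattern"], str):
            phrase_pattern_labels.append(entry["label"])
            phrase_pattern_texts.append(entry["pattern"])
            phrase_pattern_ids.append(entry.get("id"))
        elif isinstance(entry["pattern"], list):
            token_patterns.append(entry)

    # Second pass: process all phrase texts in a single batched call instead
    # of invoking the pipeline once per pattern.
    phrase_patterns = []
    for label, doc, ent_id in zip(
        phrase_pattern_labels, fake_pipe(phrase_pattern_texts), phrase_pattern_ids
    ):
        phrase_pattern = {"label": label, "pattern": doc}
        if ent_id:
            phrase_pattern["id"] = ent_id
        phrase_patterns.append(phrase_pattern)

    return token_patterns + phrase_patterns


result = add_patterns([
    {"label": "ORG", "pattern": "Apple Inc"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
    {"label": "ORG", "pattern": "Acme Corp", "id": "acme"},
])
```

After this step, every entry in `result` carries a pre-processed pattern, so the downstream loop only needs to check `isinstance(pattern, Doc)` rather than re-running the pipeline per string.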

website/docs/usage/rule-based-matching.md

Lines changed: 27 additions & 0 deletions
````diff
@@ -1096,6 +1096,33 @@ with the patterns. When you load the model back in, all pipeline components will
 be restored and deserialized – including the entity ruler. This lets you ship
 powerful model packages with binary weights _and_ rules included!
 
+### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
+
+When using a large number of **phrase patterns** (roughly > 10000), it's useful
+to understand how the `add_patterns` function of the EntityRuler works. For
+each **phrase pattern**, the EntityRuler calls the nlp object to construct a
+`Doc` object. This matters if, for example, you add the EntityRuler at the end
+of an existing pipeline with a POS tagger and want to extract matches based on
+the pattern's POS signature.
+
+In this case you would pass a config value of `phrase_matcher_attr="POS"` for
+the EntityRuler.
+
+Running the full language pipeline across every pattern in a large list scales
+linearly and can therefore take a long time on large numbers of phrase patterns.
+
+As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
+nlp.pipe on all phrase patterns, resulting in roughly a 10x-20x speed up with
+5,000-100,000 phrase patterns respectively.
+
+Even with this speedup (but especially if you're using an older version) the
+`add_patterns` function can still take a long time. An easy workaround to make
+this function run faster is disabling the other language pipes while adding
+the phrase patterns.
+
+```python
+entityruler = EntityRuler(nlp)
+patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)]
+
+other_pipes = [p for p in nlp.pipe_names if p != "tagger"]
+with nlp.disable_pipes(*other_pipes):
+    entityruler.add_patterns(patterns)
+```
+
 ## Combining models and rules {#models-rules}
 
 You can combine statistical and rule-based components in a variety of ways.
````
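The speed-up described in the docs above comes from amortizing per-call pipeline overhead across a single batched `nlp.pipe` call. A schematic, spaCy-free illustration follows; the `StubPipeline` class and its fixed per-call setup cost are invented for illustration, and real gains also depend on how spaCy batches work inside pipeline components.

```python
class StubPipeline:
    """Toy stand-in for a spaCy pipeline; counts per-call setup work."""

    def __init__(self):
        self.setup_calls = 0

    def __call__(self, text):
        self.setup_calls += 1  # fixed overhead paid on every call
        return text.split()

    def pipe(self, texts):
        self.setup_calls += 1  # fixed overhead paid once for the whole batch
        for text in texts:
            yield text.split()


texts = ["pattern %d" % i for i in range(1000)]

# Pre-2.2.4 behaviour: one pipeline call per phrase pattern.
one_by_one = StubPipeline()
docs_a = [one_by_one(t) for t in texts]

# 2.2.4 behaviour: one batched call for all phrase patterns.
batched = StubPipeline()
docs_b = list(batched.pipe(texts))
```

Both approaches produce identical docs, but the batched variant pays the setup cost once instead of a thousand times, which is the shape of the scaling win the commit targets.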
