
Commit 42b1486
Merge branch 'master' into alon/master_local
2 parents: 9450aea + 11187c1

3 files changed (+50, -27 lines)


nlp_architect/api/intent_extraction_api.py

Lines changed: 12 additions & 14 deletions
@@ -22,7 +22,7 @@
 from nlp_architect.models.intent_extraction import MultiTaskIntentModel, Seq2SeqIntentModel
 from nlp_architect.utils.generic import pad_sentences
 from nlp_architect.utils.io import download_unlicensed_file
-from nlp_architect.utils.text import SpacyInstance
+from nlp_architect.utils.text import SpacyInstance, bio_to_spans
 
 nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
 

@@ -85,22 +85,20 @@ def _download_pretrained_model(self, prompt=True):
         print('Done.')
 
     def display_results(self, text_str, predictions, intent_type):
-        ret = {'annotation_set': []}
-        ret['doc_text'] = ' '.join([t for t in text_str])
-        counter = 0
+        ret = {'annotation_set': [], 'doc_text': ' '.join([t for t in text_str])}
         spans = []
-        for t, n in zip(text_str, predictions):
-            if n != 'O':
-                ret['annotation_set'].append(n.lower())
-                spans.append({
-                    'start': counter,
-                    'end': counter + len(t),
-                    'type': n.lower()
-                })
-            counter += len(t) + 1
+        available_tags = set()
+        for s, e, tag in bio_to_spans(text_str, predictions):
+            spans.append({
+                'start': s,
+                'end': e,
+                'type': tag
+            })
+            available_tags.add(tag)
+        ret['annotation_set'] = list(available_tags)
         ret['spans'] = spans
         ret['title'] = intent_type
-        return {"doc": ret, 'type': 'high_level'}
+        return {'doc': ret, 'type': 'high_level'}
 
     def vectorize(self, doc, vocab, char_vocab=None):
         words = np.asarray([vocab[w.lower()] if w.lower() in vocab else 1 for w in doc])\
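
For reference, a minimal sketch of the payload the refactored display_results now produces. The tokens, BIO predictions, and intent name below are made up, and the snippet assumes a build of nlp_architect that includes this commit, so that bio_to_spans is importable:

from nlp_architect.utils.text import bio_to_spans

# Hypothetical tokenized utterance with BIO slot predictions
tokens = ['play', 'Hotel', 'California', 'by', 'the', 'Eagles']
preds = ['O', 'B-song', 'I-song', 'O', 'O', 'B-artist']

# Mirrors the body of the new display_results
ret = {'annotation_set': [], 'doc_text': ' '.join(tokens)}
spans = []
available_tags = set()
for s, e, tag in bio_to_spans(tokens, preds):
    spans.append({'start': s, 'end': e, 'type': tag})
    available_tags.add(tag)
ret['annotation_set'] = list(available_tags)
ret['spans'] = spans
ret['title'] = 'play_music'  # stands in for the intent_type argument
print({'doc': ret, 'type': 'high_level'})
# spans == [{'start': 5, 'end': 21, 'type': 'song'},
#           {'start': 29, 'end': 35, 'type': 'artist'}]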

nlp_architect/api/ner_api.py

Lines changed: 8 additions & 13 deletions
@@ -22,7 +22,7 @@
 from nlp_architect.models.ner_crf import NERCRF
 from nlp_architect.utils.generic import pad_sentences
 from nlp_architect.utils.io import download_unlicensed_file
-from nlp_architect.utils.text import SpacyInstance
+from nlp_architect.utils.text import SpacyInstance, bio_to_spans
 
 nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
 

@@ -89,24 +89,19 @@ def load_model(self):
 
     @staticmethod
     def pretty_print(text, tags):
-        mapped = [
-            {'index': idx, 'word': el, 'label': tags[idx]} for idx, el in enumerate(text)
-        ]
-        counter = 0
         spans = []
-        for obj in mapped:
-            if obj['label'] != 'O':
-                spans.append({
-                    'start': counter,
-                    'end': (counter + len(obj['word'])),
-                    'type': obj['label']
-                })
-            counter += len(obj['word']) + 1
+        for s, e, tag in bio_to_spans(text, tags):
+            spans.append({
+                'start': s,
+                'end': e,
+                'type': tag
+            })
         ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
         ret = {'doc_text': ' '.join(text),
                'annotation_set': list(ents),
                'spans': spans,
                'title': 'None'}
+        print({"doc": ret, 'type': 'high_level'})
         return {"doc": ret, 'type': 'high_level'}
 
     @staticmethod
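
Note that pretty_print still builds annotation_set by deduplicating span types through a dict keyed on the lowercased type. A standalone illustration of that expression (the span values are invented):

spans = [{'start': 0, 'end': 4, 'type': 'PER'},
         {'start': 11, 'end': 17, 'type': 'LOC'},
         {'start': 25, 'end': 28, 'type': 'PER'}]
ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
print(list(ents))  # ['per', 'loc'] -- one entry per type; dicts keep insertion order on Python 3.7+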

nlp_architect/utils/text.py

Lines changed: 30 additions & 0 deletions
@@ -16,6 +16,7 @@
 import re
 import sys
 from os import path
+from typing import List, Tuple
 
 import spacy
 from nltk import WordNetLemmatizer
@@ -361,3 +362,32 @@ def extract_nps(annotation_list, text=None):
     assert len(text) == len(annotation_list), 'annotations/text length mismatch'
     return_markers = [' '.join(text[s:e]) for s, e in np_markers]
     return return_markers
+
+
+def bio_to_spans(text: List[str], tags: List[str]) -> List[Tuple[int, int, str]]:
+    """
+    Convert a BIO-tagged list of strings into character-level span annotations
+    Args:
+        text: list of words
+        tags: list of tags
+
+    Returns:
+        list: (start, end, tag) triples of the detected spans
+    """
+    pointer = 0
+    starts = []
+    for i, t in enumerate(tags):
+        if t.startswith('B-'):
+            starts.append((i, pointer))
+        pointer += len(text[i]) + 1
+
+    spans = []
+    for s_i, s_char in starts:
+        label_str = tags[s_i][2:]
+        e = 0
+        e_char = len(text[s_i + e])
+        while len(tags) > s_i + e + 1 and tags[s_i + e + 1].startswith('I-'):
+            e += 1
+            e_char += 1 + len(text[s_i + e])
+        spans.append((s_char, s_char + e_char, label_str))
+    return spans
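
A quick sanity check of the new helper (tokens and tags invented for illustration; the offsets index into ' '.join(text)):

>>> from nlp_architect.utils.text import bio_to_spans
>>> bio_to_spans(['Steve', 'Jobs', 'founded', 'Apple'],
...              ['B-PER', 'I-PER', 'O', 'B-ORG'])
[(0, 10, 'PER'), (19, 24, 'ORG')]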
