
Commit 42b1486
Merge branch 'master' into alon/master_local
2 parents: 9450aea + 11187c1

3 files changed (+50, -27 lines)


nlp_architect/api/intent_extraction_api.py

Lines changed: 12 additions & 14 deletions
@@ -22,7 +22,7 @@
 from nlp_architect.models.intent_extraction import MultiTaskIntentModel, Seq2SeqIntentModel
 from nlp_architect.utils.generic import pad_sentences
 from nlp_architect.utils.io import download_unlicensed_file
-from nlp_architect.utils.text import SpacyInstance
+from nlp_architect.utils.text import SpacyInstance, bio_to_spans
 
 nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
 

@@ -85,22 +85,20 @@ def _download_pretrained_model(self, prompt=True):
         print('Done.')
 
     def display_results(self, text_str, predictions, intent_type):
-        ret = {'annotation_set': []}
-        ret['doc_text'] = ' '.join([t for t in text_str])
-        counter = 0
+        ret = {'annotation_set': [], 'doc_text': ' '.join([t for t in text_str])}
         spans = []
-        for t, n in zip(text_str, predictions):
-            if n != 'O':
-                ret['annotation_set'].append(n.lower())
-                spans.append({
-                    'start': counter,
-                    'end': counter + len(t),
-                    'type': n.lower()
-                })
-            counter += len(t) + 1
+        available_tags = set()
+        for s, e, tag in bio_to_spans(text_str, predictions):
+            spans.append({
+                'start': s,
+                'end': e,
+                'type': tag
+            })
+            available_tags.add(tag)
+        ret['annotation_set'] = list(available_tags)
         ret['spans'] = spans
         ret['title'] = intent_type
-        return {"doc": ret, 'type': 'high_level'}
+        return {'doc': ret, 'type': 'high_level'}
 
     def vectorize(self, doc, vocab, char_vocab=None):
         words = np.asarray([vocab[w.lower()] if w.lower() in vocab else 1 for w in doc])\
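
For reference, a minimal sketch of the payload the refactored display_results now produces. The tokens, BIO predictions, and intent name below are made up, and the snippet assumes a build of nlp_architect that includes this commit, so that bio_to_spans is importable:

from nlp_architect.utils.text import bio_to_spans

# Hypothetical tokenized utterance with BIO slot predictions
tokens = ['play', 'Hotel', 'California', 'by', 'the', 'Eagles']
preds = ['O', 'B-song', 'I-song', 'O', 'O', 'B-artist']

# Mirrors the body of the new display_results
ret = {'annotation_set': [], 'doc_text': ' '.join(tokens)}
spans = []
available_tags = set()
for s, e, tag in bio_to_spans(tokens, preds):
    spans.append({'start': s, 'end': e, 'type': tag})
    available_tags.add(tag)
ret['annotation_set'] = list(available_tags)
ret['spans'] = spans
ret['title'] = 'play_music'  # stands in for the intent_type argument
print({'doc': ret, 'type': 'high_level'})
# spans == [{'start': 5, 'end': 21, 'type': 'song'},
#           {'start': 29, 'end': 35, 'type': 'artist'}]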

nlp_architect/api/ner_api.py

Lines changed: 8 additions & 13 deletions
@@ -22,7 +22,7 @@
 from nlp_architect.models.ner_crf import NERCRF
 from nlp_architect.utils.generic import pad_sentences
 from nlp_architect.utils.io import download_unlicensed_file
-from nlp_architect.utils.text import SpacyInstance
+from nlp_architect.utils.text import SpacyInstance, bio_to_spans
 
 nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
 

@@ -89,24 +89,19 @@ def load_model(self):
 
     @staticmethod
     def pretty_print(text, tags):
-        mapped = [
-            {'index': idx, 'word': el, 'label': tags[idx]} for idx, el in enumerate(text)
-        ]
-        counter = 0
         spans = []
-        for obj in mapped:
-            if obj['label'] != 'O':
-                spans.append({
-                    'start': counter,
-                    'end': (counter + len(obj['word'])),
-                    'type': obj['label']
-                })
-            counter += len(obj['word']) + 1
+        for s, e, tag in bio_to_spans(text, tags):
+            spans.append({
+                'start': s,
+                'end': e,
+                'type': tag
+            })
         ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
         ret = {'doc_text': ' '.join(text),
                'annotation_set': list(ents),
                'spans': spans,
                'title': 'None'}
+        print({"doc": ret, 'type': 'high_level'})
         return {"doc": ret, 'type': 'high_level'}
 
     @staticmethod
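
Note that pretty_print still builds annotation_set by deduplicating span types through a dict keyed on the lowercased type. A standalone illustration of that expression (the span values are invented):

spans = [{'start': 0, 'end': 4, 'type': 'PER'},
         {'start': 11, 'end': 17, 'type': 'LOC'},
         {'start': 25, 'end': 28, 'type': 'PER'}]
ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
print(list(ents))  # ['per', 'loc'] -- one entry per type; dicts keep insertion order on Python 3.7+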

nlp_architect/utils/text.py

Lines changed: 30 additions & 0 deletions
@@ -16,6 +16,7 @@
 import re
 import sys
 from os import path
+from typing import List, Tuple
 
 import spacy
 from nltk import WordNetLemmatizer
@@ -361,3 +362,32 @@ def extract_nps(annotation_list, text=None):
     assert len(text) == len(annotation_list), 'annotations/text length mismatch'
     return_markers = [' '.join(text[s:e]) for s, e in np_markers]
     return return_markers
+
+
+def bio_to_spans(text: List[str], tags: List[str]) -> List[Tuple[int, int, str]]:
+    """
+    Convert a BIO-tagged list of strings into character-level span annotations
+    Args:
+        text: list of words
+        tags: list of tags
+
+    Returns:
+        list: (start, end, tag) triples of the detected spans
+    """
+    pointer = 0
+    starts = []
+    for i, t in enumerate(tags):
+        if t.startswith('B-'):
+            starts.append((i, pointer))
+        pointer += len(text[i]) + 1
+
+    spans = []
+    for s_i, s_char in starts:
+        label_str = tags[s_i][2:]
+        e = 0
+        e_char = len(text[s_i + e])
+        while len(tags) > s_i + e + 1 and tags[s_i + e + 1].startswith('I-'):
+            e += 1
+            e_char += 1 + len(text[s_i + e])
+        spans.append((s_char, s_char + e_char, label_str))
+    return spans
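
A quick sanity check of the new helper (tokens and tags invented for illustration; the offsets index into ' '.join(text)):

>>> from nlp_architect.utils.text import bio_to_spans
>>> bio_to_spans(['Steve', 'Jobs', 'founded', 'Apple'],
...              ['B-PER', 'I-PER', 'O', 'B-ORG'])
[(0, 10, 'PER'), (19, 24, 'ORG')]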
