import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import json
from datetime import datetime
nlp = spacy.load("./output/model-best") # ner model trained on conll data
print(nlp.pipe_names)
# ['tok2vec', 'ner']
nlp.add_pipe("merge_entities") # added merge_entities pipeline
print(nlp.pipe_names)
# ['tok2vec', 'ner', 'merge_entities']
texts = ["I live in New York USA",
"Steve Smith is a cricketer"]
results = []
index = 0
print("Starting Spacy Ner")
now = datetime.now()
end_count = 1
n = 0
while n < end_count:
n += 1
for doc in nlp.pipe(texts):
index += 1
for ent in doc.ents:
temp = [index,ent.label_,ent.text,ent.start_char,ent.end_char]
results.append(temp)
elapsed_time = datetime.now() - now
print("Ner done",elapsed_time )
[[1, 'I-LOC', 'New', 10, 13],
[1, 'I-LOC', 'York', 14, 18],
[1, 'I-LOC', 'USA', 19, 22],
[2, 'I-PER', 'Steve', 0, 5],
[2, 'I-PER', 'Smith', 6, 11]]
I am expecting that the New York and Steve Smith should come as one word because we have used merge_entities in the pipeline.
Please let me know if I am doing anything worng.